From 9501108af46b65e246d4d9b84edfdd7bc6a825b6 Mon Sep 17 00:00:00 2001
From: Sean Dague <sean.dague@samsung.com>
Date: Tue, 17 Dec 2013 10:48:49 -0500
Subject: [PATCH] add support for classification rate

This adds a section to the end of the report showing the current
classification rate, as well as the jobs with the most failures that
are still unclassified.

Change-Id: Ia1426bef2d788f8b2c14e5a1402a5ac01a3561f7
---
 elastic_recheck/cmd/check_success.py | 80 ++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/elastic_recheck/cmd/check_success.py b/elastic_recheck/cmd/check_success.py
index 8610d207..b4e6bb0c 100755
--- a/elastic_recheck/cmd/check_success.py
+++ b/elastic_recheck/cmd/check_success.py
@@ -15,7 +15,9 @@
 #    under the License.
 
 import argparse
+import operator
 import os
+import re
 
 from launchpadlib import launchpad
 
@@ -33,25 +35,95 @@ def get_options():
     parser.add_argument('--lp', '-l', help="Query Launchpad",
                         type=bool,
                         default=False)
+    parser.add_argument('--rate', '-r', help="Classification rate",
+                        type=bool,
+                        default=True)
     return parser.parse_args()
 
 
+def all_fails(classifier):
+    """Find all the fails in the integrated gate.
+
+    Used to work out the total classification rate. Returns a dict mapping
+    "<build_uuid facet key>.<build_name>" to False for each matching failure.
+    """
+    all_fails = {}
+    query = ('filename:"console.html" '
+             'AND message:"Finished: FAILURE" '
+             'AND build_queue:"gate"')
+    results = classifier.hits_by_query(query, size=30000)
+    facets = er_results.FacetSet()
+    facets.detect_facets(results, ["build_uuid"])
+    for build in facets:
+        for result in facets[build]:
+            # not perfect, but basically an attempt to show the integrated
+            # gate. Would be nice if there was a zuul attr for this in es.
+            if re.search("(^openstack/|devstack|grenade)", result.project):
+                all_fails["%s.%s" % (build, result.build_name)] = False
+    return all_fails
+
+
+def classifying_rate(classifier, data):
+    """Build and print the classification rate.
+
+    Starts from all_fails(), marks every entry of each bug's 'failed_jobs'
+    in ``data`` as classified, then prints the classified percentage and
+    the job names with the most unclassified failures, highest count first.
+    """
+    fails = all_fails(classifier)
+    for bug in data.values():
+        for job in bug['failed_jobs']:
+            fails[job] = True
+
+    total = len(fails)
+    if total == 0:
+        # No failures at all: the percentage below would divide by zero.
+        print("Classification percentage: n/a (no failures found)")
+        return
+
+    bad_jobs = {}
+    count = 0
+    for key, classified in fails.items():
+        if classified:
+            count += 1
+        else:
+            # Keys look like "<build>.<job name>"; tally by job name only.
+            job = key.split('.', 1)[1]
+            bad_jobs[job] = bad_jobs.get(job, 0) + 1
+
+    print("Classification percentage: %2.2f%%" %
+          ((float(count) / float(total)) * 100.0))
+    ranked = sorted(
+        bad_jobs.items(), key=operator.itemgetter(1), reverse=True)
+    print("Job fails with most unclassified errors")
+    for job, num in ranked:
+        # Parenthesised single-argument print works on Python 2 and 3.
+        print("  %3s : %s" % (num, job))
+
+
 def collect_metrics(classifier):
     data = {}
     for q in classifier.queries:
         results = classifier.hits_by_query(q['query'], size=30000)
         facets = er_results.FacetSet()
-        facets.detect_facets(results, ["build_status", "build_uuid"])
+        facets.detect_facets(
+            results,
+            ["build_status", "build_uuid"])
 
         num_fails = 0
+        failed_jobs = []  # "<build>.<build_name>" per FAILURE result
         if "FAILURE" in facets:
             num_fails = len(facets["FAILURE"])
+            for build in facets["FAILURE"]:
+                for result in facets["FAILURE"][build]:
+                    failed_jobs.append("%s.%s" % (build, result.build_name))
 
         data[q['bug']] = {
             'fails': num_fails,
             'hits': facets,
-            'query': q['query']
-            }
+            'query': q['query'],
+            'failed_jobs': failed_jobs
+        }
 
     return data
 
@@ -90,6 +162,8 @@ def main():
     classifier = er.Classifier(opts.dir)
     data = collect_metrics(classifier)
     print_metrics(data, with_lp=opts.lp)
+    if opts.rate:
+        classifying_rate(classifier, data)
 
 
 if __name__ == "__main__":