[PATCH RFC] releasenotes: add similarity check function to compare incoming notes

Fri Jun 23 16:37:22 UTC 2017

# HG changeset patch
# User Rishabh Madan <rishabhmadan96 at gmail.com>
# Date 1498235213 -7200
#      Fri Jun 23 18:26:53 2017 +0200
# Node ID a90693382178ca82b2918ee4b159dfb490d1bfc8
# Parent  b6e6d8df88beb042f5a37123a0ea6a9b437f7755
releasenotes: add similarity check function to compare incoming notes

It is possible that the incoming note fragments might have some similar content
as the existing release notes. In case of a bug fix, we match for issueNNNN in the
existing notes. For other general cases, it makes use of fuzzywuzzy library to
get a similarity score. If the score is above a certain threshold, we ignore
the fragment; otherwise we add it. But the score might be misleading for small commit
messages, so the similarity function is used only if the length of the string (in words)
is above a certain number. The patch also adds tests related to its usage.
However, it still needs improvement in how the incoming notes are combined. We
could use an interactive mode for adding the notes, perhaps when the similarity
score falls within a certain range.

diff -r b6e6d8df88be -r a90693382178 hgext/releasenotes.py

--- a/hgext/releasenotes.py	Fri Jun 23 17:15:53 2017 +0200
+++ b/hgext/releasenotes.py	Fri Jun 23 18:26:53 2017 +0200
@@ -12,6 +12,7 @@
 """
 
 from __future__ import absolute_import
+from fuzzywuzzy import fuzz
 
 import errno
 import re
@@ -44,6 +45,7 @@
 ]
 
 RE_DIRECTIVE = re.compile('^\.\. ([a-zA-Z0-9_]+)::\s*([^$]+)?$')
+RE_ISSUE = r'\bissue [0-9]{4,6}(?![0-9])\b|\bissue[0-9]{4,6}(?![0-9])\b'
 
 BULLET_SECTION = _('Other Changes')
 
@@ -89,7 +91,20 @@
 
         This is used to combine multiple sources of release notes together.
         """
+
+        all_points = []
+
         for section in other:
+            for title, paragraphs in self.titledforsection(section):
+                str = ""
+                str = converttostring(paragraphs)
+                all_points.append(str)
+
+            for paragraphs in self.nontitledforsection(section):
+                str = ""
+                str = converttostring(paragraphs)
+                all_points.append(str)
+
             for title, paragraphs in other.titledforsection(section):
                 if self.hastitledinsection(section, title):
                     # TODO prompt for resolution if different and running in
@@ -97,18 +112,59 @@
                     ui.write(_('%s already exists in %s section; ignoring\n') %
                              (title, section))
                     continue
+                str_incoming = converttostring(paragraphs)
+                if section == 'fix':
+                    issues = re.findall(RE_ISSUE, str_incoming, re.IGNORECASE)
+                    if len(issues) > 0:
+                        issuenumber = issues[0]
+                        issuenumber = "".join(issuenumber.split())
+                        if any(issuenumber in s for s in all_points):
+                            ui.write(_("\"%s\" already exists in notes; "
+                                     "ignoring\n") % issuenumber)
+                            continue
+                        else:
+                            self.addtitleditem(section, title, paragraphs)
+                            continue
 
-                # TODO perform similarity comparison and try to match against
-                # existing.
-                self.addtitleditem(section, title, paragraphs)
+                if len(str_incoming.split()) > 10:
+                    merge = similaritycheck(str_incoming, all_points)
+
+                    if not merge:
+                        ui.write(_("\"%s\" already exists in notes file; "
+                                 "ignoring\n") % str_incoming)
+                    else:
+                        self.addtitleditem(section, title, paragraphs)
+                else:
+                    self.addtitleditem(section, title, paragraphs)
 
             for paragraphs in other.nontitledforsection(section):
-                if paragraphs in self.nontitledforsection(section):
-                    continue
+                str_incoming = converttostring(paragraphs)
+                if section == 'fix':
+                    issues = re.findall(RE_ISSUE, str_incoming, re.IGNORECASE)
+                    if len(issues) > 0:
+                        issuenumber = issues[0].lower()
+                        issuenumber = "".join(issuenumber.split())
+                        if any(issuenumber in s for s in all_points):
+                            ui.write(_("\"%s\" already exists in notes; "
+                                     "ignoring\n") % str_incoming)
+                            continue
+                        else:
+                            self.addnontitleditem(section, paragraphs)
+                            continue
 
-                # TODO perform similarily comparison and try to match against
-                # existing.
-                self.addnontitleditem(section, paragraphs)
+                if paragraphs in self.nontitledforsection(section):
+                        continue
+
+                if len(str_incoming.split()) > 10:
+                    merge = similaritycheck(str_incoming, all_points)
+
+                    if not merge:
+                        ui.write(_("\"%s\" already exists in notes; "
+                                 "ignoring\n") % str_incoming)
+                    else:
+                        self.addnontitleditem(section, paragraphs)
+                else:
+                    self.addnontitleditem(section, paragraphs)
 
 class releasenotessections(object):
     def __init__(self, ui):
@@ -128,6 +184,27 @@
 
         return None
 
+def converttostring(paragraphs):
+    """
+    Converts paragraph and bullet data to individual strings.
+    """
+    str = ""
+    for para in paragraphs:
+        str += ' '.join(para) + ' '
+    return str
+
+def similaritycheck(incoming_str, existingnotes):
+    """
+    Returns true when note fragment can be merged to existing notes.
+    """
+    merge = True
+    for bullet in existingnotes:
+        score = fuzz.token_set_ratio(incoming_str, bullet)
+        if score > 75:
+            merge = False
+            break
+    return merge
+
 def parsenotesfromrevisions(repo, directives, revs):
     notes = parsedreleasenotes()