[PATCH 1 of 3] convert: cvsps.py - code to generate changesets from a CVS repository
Frank Kingswood
frank at kingswood-consulting.co.uk
Thu Apr 24 13:49:00 CDT 2008
# HG changeset patch
# User Frank Kingswood <frank at kingswood-consulting.co.uk>
# Date 1209062044 -3600
# Node ID a8bfd9e3e102f14d2db6242ca11a7a022601af63
# Parent 626cb86a6523c9e8b453719314dd31fa4d61ced3
convert: cvsps.py - code to generate changesets from a CVS repository
diff -r 626cb86a6523 -r a8bfd9e3e102 hgext/convert/cvsps.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/convert/cvsps.py Thu Apr 24 19:34:04 2008 +0100
@@ -0,0 +1,525 @@
+#!/usr/bin/env python
+#
+# Mercurial built-in replacement for cvsps.
+#
+# Copyright 2008, Frank Kingswood <frank at kingswood-consulting.co.uk>
+#
+# This software may be used and distributed according to the terms
+# of the GNU General Public License, incorporated herein by reference.
+
+import os
+import re
+import sys
+import cPickle as pickle
+from mercurial import util
+from mercurial.i18n import _
+
def listsort(seq,key):
    """Sort seq in place by key, working on Python 2.3 as well.

    Python 2.3's list.sort() does not accept a key= argument, which
    raises TypeError; fall back to an equivalent comparison function.
    (Renamed the parameter from 'list', which shadowed the builtin.)
    """
    try:
        seq.sort(key=key)
    except TypeError:
        # Python 2.3 fallback: emulate key= with a cmp function
        seq.sort(lambda l,r:cmp(key(l),key(r)))
+
class cvsps_log_entry(object):
    '''One revision of one file as reported by CVS (r)log.

    Attributes (set via keyword arguments and/or by the parser):
    .Author - author name as CVS knows it
    .Branch - name of branch this revision is on
    .Branches - revision tuple of branches starting at this revision
    .Comment - commit message
    .Date - the commit date as a (time,tz) tuple
    .Dead - true if file revision is dead
    .File - Name of file
    .Lines - a tuple (+lines,-lines) or None
    .Parent - Previous revision of this entry
    .RCS - name of file as returned from CVS
    .Revision - revision number as tuple
    .Tags - list of tags on the file
    '''
    def __init__(self,**entries):
        # store every keyword argument as an instance attribute
        for name,value in entries.items():
            setattr(self,name,value)
+
class cvsps_log_error(Exception):
    '''Raised when collecting or caching the CVS log fails.'''
+
def cvsps_create_log(ui,directory=None,root=None,rlog=True,cache=None):
    '''Collect the CVS rlog.

    Runs "cvs rlog" (or "cvs log" when rlog is false) and parses its
    output with a state machine into a list of cvsps_log_entry objects,
    sorted by (RCS file, revision), with per-file parent revisions
    filled in.

    ui        - Mercurial ui object for status/debug output
    directory - directory to log; None means use the current CVS sandbox
    root      - CVSROOT to use; defaults to CVS/Root or $CVSROOT
    rlog      - use "cvs rlog" rather than "cvs log"
    cache     - if set, cache the log under ~/.hg.cvsps; the value
                'update' also reads the existing cache and only asks
                CVS for entries newer than the cached ones

    Raises cvsps_log_error when not in a sandbox (directory=None) or
    when the cache overlaps with freshly fetched entries.
    '''

    # reusing strings typically saves about 40% of memory
    _scache={}
    def scache(s):
        "return a shared version of a string"
        try:
            return _scache[s]
        except:
            _scache[s]=s
            return s

    ui.status(_('collecting CVS rlog\n'))

    log=[]      # list of cvsps_log_entry objects containing the CVS state

    # patterns to match in CVS (r)log output, by state of use
    re_00=re.compile('RCS file: (.+)$')
    re_01=re.compile('cvs \\[r?log aborted\\]: (.+)$')
    re_02=re.compile('cvs (r?log|server): (.+)\n$')
    re_03=re.compile("(Cannot access.+CVSROOT)|(can't create temporary directory.+)$")
    re_10=re.compile('Working file: (.+)$')
    re_20=re.compile('symbolic names:')
    re_30=re.compile('\t(.+): ([\\d.]+)$')
    re_31=re.compile('----------------------------$')
    re_32=re.compile('=============================================================================$')
    re_50=re.compile('revision ([\\d.]+)(\s+locked by:\s+.+;)?$')
    re_60=re.compile(r'date:\s+(.+);\s+author:\s+(.+);\s+state:\s+(.+?);(\s+lines:\s+(\+\d+)?\s+(-\d+)?;)?')
    re_70=re.compile('branches: (.+);$')

    prefix=''   # leading path to strip of what we get from CVS

    if directory is None:
        # Current working directory

        # Get the real directory in the repository
        try:
            prefix=directory=file('CVS/Repository').read().strip()
        except IOError:
            raise cvsps_log_error('Not a CVS sandbox')

        if not prefix.endswith('/'):
            prefix+='/'

        # Use the Root file in the sandbox, if it exists
        try:
            root=file('CVS/Root').read().strip()
        except IOError:
            pass

    if not root:
        root=os.environ.get('CVSROOT',None)

    # read log cache if one exists
    oldlog=[]
    date=None

    if cache:
        cachedir=os.path.expanduser('~/.hg.cvsps')
        if not os.path.exists(cachedir):
            os.mkdir(cachedir)
        # cache file name is derived from root and directory, reduced
        # to word characters so it is safe as a single file name
        cachefile=(root or "").split(":")+[directory,"cache"]
        cachefile=['-'.join(re.findall(r'\w+',s)) for s in cachefile if s]
        cachefile=os.path.join(cachedir,'.'.join(cachefile))

        if cache=='update':
            try:
                ui.debug(_('reading cvs log cache %s\n')%cachefile)
                oldlog=pickle.load(file(cachefile))
                ui.debug(_('cache has %d log entries\n')%len(oldlog))
            except Exception,e:
                # a stale or unreadable cache is not fatal; fall back
                # to fetching the full log
                ui.debug(_('error reading cache: %r\n')%e)

            if oldlog:
                date=oldlog[-1].Date    # last commit date as a (time,tz) tuple
                date=util.datestr(date,'%Y/%m/%d %H:%M:%S %1%2')

    # build the CVS commandline
    cmd=['cvs','-q']
    if root:
        cmd.append('-d%s'%root)
        p=root.split(':')[-1]
        if not p.endswith('/'):
            p+='/'
        prefix=p+prefix
    cmd.append(['log','rlog'][rlog])
    if date:
        # no space between option and date string
        cmd.append('-d>%s'%date)
    cmd.append(directory)

    # state machine begins here; states seen in the parser below:
    #   0 - expecting "RCS file:" (or a cvs error/diagnostic line)
    #   1 - expecting "Working file:" (only used when rlog is false)
    #   2 - expecting "symbolic names:"
    #   3 - reading the symbolic names (tag -> revision) table
    #   4 - expecting a revision separator
    #   5 - expecting a "revision" line
    #   6 - expecting the date/author/state/lines line
    #   7 - reading the comment, possibly preceded by "branches:"
    #   8 - reading the comment after a "branches:" line
    tags={}     # dictionary of revisions on current file with their tags
    state=0
    store=False # set when a new record can be appended

    cmd=[util.shellquote(arg) for arg in cmd]

    for line in util.popen(' '.join(cmd)):
        if line.endswith('\n'):
            line=line[:-1]
        #ui.debug('state=%d line=%r\n'%(state,line))

        if state==0:
            match=re_00.match(line)
            if match:
                rcs=match.group(1)
                tags={}
                if rlog:
                    # rlog prints repository paths; strip the prefix,
                    # the ",v" suffix and any Attic component to get
                    # the working file name
                    filename=rcs[:-2]
                    if filename.startswith(prefix):
                        filename=filename[len(prefix):]
                    if filename.startswith('/'):
                        filename=filename[1:]
                    if filename.startswith('Attic/'):
                        filename=filename[6:]
                    else:
                        filename=filename.replace('/Attic/','/')
                    state=2
                    continue
                state=1
                continue
            match=re_01.match(line)
            if match:
                raise Exception(match.group(1))
            match=re_02.match(line)
            if match:
                raise Exception(match.group(2))
            if re_03.match(line):
                raise Exception(line)

        elif state==1:
            match=re_10.match(line)
            assert match,_('RCS file must be followed by working file')
            filename=match.group(1)
            state=2

        elif state==2:
            if re_20.match(line):
                state=3

        elif state==3:
            # read the symbolic names table, mapping revision tuples
            # to the list of tags on them
            match=re_30.match(line)
            if match:
                rev=[int(x) for x in match.group(2).split('.')]

                # Convert magic branch number to an odd-numbered one
                revn=len(rev)
                if revn>3 and (revn%2)==0 and rev[-2]==0:
                    rev=rev[:-2]+rev[-1:]
                rev=tuple(rev)

                if rev not in tags:
                    tags[rev]=[]
                tags[rev].append(match.group(1))

            elif re_31.match(line):
                state=5
            elif re_32.match(line):
                state=0

        elif state==4:
            if re_31.match(line):
                state=5
            else:
                assert not re_32.match(line),_('Must have at least some revisions')

        elif state==5:
            match=re_50.match(line)
            assert match,_('expected revision number')
            e=cvsps_log_entry(RCS=scache(rcs),File=scache(filename),
                Revision=tuple([int(x) for x in match.group(1).split('.')]),
                Branches=[],Parent=None)
            state=6

        elif state==6:
            match=re_60.match(line)
            assert match,_('revision must be followed by date line')
            d=match.group(1)
            if d[2]=='/':
                # Y2K
                d='19'+d

            if len(d.split())!=3:
                # cvs log dates always in GMT
                d=d+' UTC'
            e.Date=util.parsedate(d,['%y/%m/%d %H:%M:%S','%Y/%m/%d %H:%M:%S','%Y-%m-%d %H:%M:%S'])
            e.Author=scache(match.group(2))
            e.Dead=match.group(3).lower()=='dead'

            # lines: +N -M; either part may be absent
            if match.group(5):
                if match.group(6):
                    e.Lines=(int(match.group(5)),int(match.group(6)))
                else:
                    e.Lines=(int(match.group(5)),0)
            elif match.group(6):
                e.Lines=(0,int(match.group(6)))
            else:
                e.Lines=None
            e.Comment=[]
            state=7

        elif state==7:
            m=re_70.match(line)
            if m:
                e.Branches=[tuple([int(y) for y in x.strip().split('.')])
                            for x in m.group(1).split(';')]
                state=8
            elif re_31.match(line):
                # end of this revision's comment: store the entry
                state=5
                store=True
            elif re_32.match(line):
                # end of this file: store the entry
                state=0
                store=True
            else:
                e.Comment.append(line)

        elif state==8:
            if re_31.match(line):
                state=5
                store=True
            elif re_32.match(line):
                state=0
                store=True
            else:
                e.Comment.append(line)

        if store:
            # a complete log entry has been read; finalize and save it
            store=False
            e.Tags=[scache(x) for x in tags.get(e.Revision,[])]
            e.Tags.sort()
            e.Comment=scache('\n'.join(e.Comment))

            # a branch revision has an even number of >3 components,
            # e.g. 1.2.4.5 is on the branch tagged at 1.2.4
            revn=len(e.Revision)
            if revn>3 and (revn%2)==0:
                e.Branch=tags.get(e.Revision[:-1],[None])[0]
            else:
                e.Branch=None

            log.append(e)

            if len(log)%100==0:
                ui.status(util.ellipsis('%d %s'%(len(log),e.File),80)+'\n')

    listsort(log,key=lambda x:(x.RCS,x.Revision))

    # find parent revisions of individual files
    versions={}
    for e in log:
        branch=e.Revision[:-1]
        p=versions.get((e.RCS,branch),None)
        if p is None:
            # first revision seen on this branch: parent is the
            # branch point (revision minus the last two components)
            p=e.Revision[:-2]
        e.Parent=p
        versions[(e.RCS,branch)]=e.Revision

    # update the log cache
    if cache:
        if log:
            # join up the old and new logs
            listsort(log,key=lambda x:x.Date)

            if oldlog and oldlog[-1].Date>=log[0].Date:
                raise cvsps_log_error('Log cache overlaps with new log entries, re-run without cache.')

            log=oldlog+log

            # write the new cachefile
            ui.debug(_('writing cvs log cache %s\n')%cachefile)
            pickle.dump(log,file(cachefile,'w'))
        else:
            log=oldlog

    ui.status(_('%d log entries\n')%len(log))

    return log
+
+
class cvsps_changeset(object):
    '''A group of CVS log entries forming one logical commit.

    Attributes (set via keyword arguments and/or by the grouping code):
    .Author - author name as CVS knows it
    .Branch - name of branch this changeset is on, or None
    .Comment - commit message
    .Date - the commit date as a (time,tz) tuple
    .Entries - list of cvsps_log_entry objects in this changeset
    .Parent - list of one or two parent changesets
    .Tags - list of tags on this changeset
    '''
    def __init__(self,**entries):
        # store every keyword argument as an instance attribute
        for name,value in entries.items():
            setattr(self,name,value)
+
def cvsps_create_changeset(ui,log,fuzz=60,mergefrom=None,mergeto=None):
    '''Convert log into changesets.

    Groups cvsps_log_entry objects that share comment, author and
    branch and are at most *fuzz* seconds apart into cvsps_changeset
    objects, sorts them, collects tags, links parent changesets, and
    handles {{mergefrombranch X}} / {{mergetobranch X}} markers in the
    commit comments.

    ui        - Mercurial ui object for status output
    log       - list of cvsps_log_entry objects (sorted in place)
    fuzz      - maximum time in seconds between commits in a changeset
    mergefrom - regex (or pattern string) marking a merge from a branch;
                None selects the default {{mergefrombranch ([-\w]+)}}
    mergeto   - regex (or pattern string) marking a merge to a branch;
                None selects the default {{mergetobranch ([-\w]+)}}

    Returns the list of cvsps_changeset objects, numbered from 1 in .Id.
    '''

    ui.status(_('creating changesets\n'))

    # Merge changesets

    listsort(log,key=lambda x:(x.Comment,x.Author,x.Branch,x.Date))

    changeset=[]
    files={}
    c=None
    for i,e in enumerate(log):

        # Check if log entry belongs to the current changeset or not.
        # A file may appear only once per changeset, and the entry's
        # date must fall within fuzz seconds of the changeset's date.
        if not (c and
                e.Comment==c.Comment and
                e.Author==c.Author and
                e.Branch==c.Branch and
                (c.Date[0]+c.Date[1])<=(e.Date[0]+e.Date[1])<=(c.Date[0]+c.Date[1])+fuzz and
                e.File not in files):
            c=cvsps_changeset(Comment=e.Comment,Author=e.Author,
                Branch=e.Branch,Date=e.Date,Entries=[])
            changeset.append(c)
            files={}
            if len(changeset)%100==0:
                ui.status(util.ellipsis('%d %s'%(len(changeset),repr(e.Comment)[1:-1]),80)+'\n')

        e.Changeset=c
        c.Entries.append(e)
        files[e.File]=True
        c.Date=e.Date       # changeset date is date of latest commit in it

    # Sort files in each changeset

    for c in changeset:
        def pathcompare(l,r):
            'Mimic cvsps sorting order'
            l=l.split('/')
            r=r.split('/')
            nl=len(l)
            nr=len(r)
            n=min(nl,nr)
            for i in range(n):
                # shorter paths sort before their extensions
                if i+1==nl and nl<nr:
                    return -1
                elif i+1==nr and nl>nr:
                    return +1
                elif l[i]<r[i]:
                    return -1
                elif l[i]>r[i]:
                    return +1
            return 0
        def entitycompare(l,r):
            # compare two log entries by their file paths
            return pathcompare(l.File,r.File)

        c.Entries.sort(entitycompare)

    # Sort changesets by date

    def cscmp(l,r):
        # primary order: total commit time (seconds + tz offset)
        d=sum(l.Date)-sum(r.Date)
        if d:
            return d

        # detect vendor branches and initial commits on a branch
        # NOTE: the local dict 're' deliberately shadows the re module
        # inside this function only
        le={}
        for e in l.Entries:
            le[e.RCS]=e.Revision
        re={}
        for e in r.Entries:
            re[e.RCS]=e.Revision

        d=0
        # if r contains the parent revision of a file in l, then
        # r must come first (and vice versa below)
        for e in l.Entries:
            if re.get(e.RCS,None)==e.Parent:
                assert not d
                d=1
                break

        for e in r.Entries:
            if le.get(e.RCS,None)==e.Parent:
                assert not d
                d=-1
                break

        return d

    changeset.sort(cscmp)

    # Collect tags

    globaltags={}
    for c in changeset:
        tags={}
        for e in c.Entries:
            for tag in e.Tags:
                # remember which is the latest changeset to have this tag
                globaltags[tag]=c

    for c in changeset:
        tags={}
        for e in c.Entries:
            for tag in e.Tags:
                tags[tag]=True
        # remember tags only if this is the latest changeset to have it
        tagnames=[tag for tag in tags if globaltags[tag] is c]
        tagnames.sort()
        c.Tags=tagnames

    # Find parent changesets, handle {{mergetobranch BRANCHNAME}}
    # by inserting dummy changesets with two parents, and handle
    # {{mergefrombranch BRANCHNAME}} by setting two parents.

    if mergeto is None:
        mergeto=r'{{mergetobranch ([-\w]+)}}'
    if mergeto:
        mergeto=re.compile(mergeto)

    if mergefrom is None:
        mergefrom=r'{{mergefrombranch ([-\w]+)}}'
    if mergefrom:
        mergefrom=re.compile(mergefrom)

    versions={}    # changeset index where we saw any particular file version
    branches={}    # changeset index where we saw a branch
    n=len(changeset)
    i=0
    while i<n:
        c=changeset[i]

        for f in c.Entries:
            versions[(f.RCS,f.Revision)]=i

        # the parent is the last changeset on the same branch, or
        # failing that the latest changeset containing a parent
        # revision of any file in this changeset
        p=None
        if c.Branch in branches:
            p=branches[c.Branch]
        else:
            for f in c.Entries:
                p=max(p,versions.get((f.RCS,f.Parent),None))

        c.Parents=[]
        if p is not None:
            c.Parents.append(changeset[p])

        if mergefrom:
            m=mergefrom.search(c.Comment)
            if m:
                m=m.group(1)
                if m=='HEAD':
                    m=None
                if m in branches and c.Branch!=m:
                    # merge from branch: add the branch head as a
                    # second parent of this changeset
                    c.Parents.append(changeset[branches[m]])

        if mergeto:
            m=mergeto.search(c.Comment)
            if m:
                try:
                    m=m.group(1)
                    if m=='HEAD':
                        m=None
                except:
                    m=None   # if no group found then merge to HEAD
                if m in branches and c.Branch!=m:
                    # insert empty changeset for merge
                    cc=cvsps_changeset(Author=c.Author,Branch=m,Date=c.Date,
                            Comment='convert-repo: CVS merge from branch %s'%c.Branch,
                            Entries=[],Tags=[],Parents=[changeset[branches[m]],c])
                    changeset.insert(i+1,cc)
                    branches[m]=i+1

                    # adjust our loop counters now we have inserted a new entry
                    n+=1
                    i+=2
                    continue

        branches[c.Branch]=i
        i+=1

    # Number changesets

    for i,c in enumerate(changeset):
        c.Id=i+1

    ui.status(_('%d changeset entries\n')%len(changeset))

    return changeset
+
+# EOF cvsps.py
More information about the Mercurial-devel
mailing list