[PATCH 1 of 3] convert: cvsps.py - code to generate changesets from a CVS repository
Frank Kingswood
frank at kingswood-consulting.co.uk
Thu Apr 24 13:49:00 CDT 2008
# HG changeset patch
# User Frank Kingswood <frank at kingswood-consulting.co.uk>
# Date 1209062044 -3600
# Node ID a8bfd9e3e102f14d2db6242ca11a7a022601af63
# Parent 626cb86a6523c9e8b453719314dd31fa4d61ced3
convert: cvsps.py - code to generate changesets from a CVS repository
diff -r 626cb86a6523 -r a8bfd9e3e102 hgext/convert/cvsps.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/convert/cvsps.py Thu Apr 24 19:34:04 2008 +0100
@@ -0,0 +1,525 @@
+#!/usr/bin/env python
+#
+# Mercurial built-in replacement for cvsps.
+#
+# Copyright 2008, Frank Kingswood <frank at kingswood-consulting.co.uk>
+#
+# This software may be used and distributed according to the terms
+# of the GNU General Public License, incorporated herein by reference.
+
+import os
+import re
+import sys
+import cPickle as pickle
+from mercurial import util
+from mercurial.i18n import _
+
def listsort(seq,key):
    """Sort seq in place by key, working on Python 2.3 as well.

    Python 2.3's list.sort() does not accept a key= argument, which
    raises TypeError; fall back to an equivalent comparison function.
    (Renamed the parameter from 'list', which shadowed the builtin.)
    """
    try:
        seq.sort(key=key)
    except TypeError:
        # Python 2.3 fallback: emulate key= with a cmp function
        seq.sort(lambda l,r:cmp(key(l),key(r)))
+
class cvsps_log_entry(object):
    '''One revision of one file as reported by CVS (r)log.

    Attributes (set via keyword arguments and/or by the parser):
    .Author - author name as CVS knows it
    .Branch - name of branch this revision is on
    .Branches - revision tuple of branches starting at this revision
    .Comment - commit message
    .Date - the commit date as a (time,tz) tuple
    .Dead - true if file revision is dead
    .File - Name of file
    .Lines - a tuple (+lines,-lines) or None
    .Parent - Previous revision of this entry
    .RCS - name of file as returned from CVS
    .Revision - revision number as tuple
    .Tags - list of tags on the file
    '''
    def __init__(self,**entries):
        # store every keyword argument as an instance attribute
        for name,value in entries.items():
            setattr(self,name,value)
+
class cvsps_log_error(Exception):
    '''Raised when collecting or caching the CVS log fails.'''
+
def cvsps_create_log(ui,directory=None,root=None,rlog=True,cache=None):
    '''Collect the CVS rlog.

    Runs "cvs rlog" (or "cvs log" when rlog is false) and parses its
    output with a state machine into a list of cvsps_log_entry objects,
    sorted by (RCS file, revision), with per-file parent revisions
    filled in.

    ui        - Mercurial ui object for status/debug output
    directory - directory to log; None means use the current CVS sandbox
    root      - CVSROOT to use; defaults to CVS/Root or $CVSROOT
    rlog      - use "cvs rlog" rather than "cvs log"
    cache     - if set, cache the log under ~/.hg.cvsps; the value
                'update' also reads the existing cache and only asks
                CVS for entries newer than the cached ones

    Raises cvsps_log_error when not in a sandbox (directory=None) or
    when the cache overlaps with freshly fetched entries.
    '''

    # reusing strings typically saves about 40% of memory
    _scache={}
    def scache(s):
        "return a shared version of a string"
        try:
            return _scache[s]
        except:
            _scache[s]=s
            return s

    ui.status(_('collecting CVS rlog\n'))

    log=[]      # list of cvsps_log_entry objects containing the CVS state

    # patterns to match in CVS (r)log output, by state of use
    re_00=re.compile('RCS file: (.+)$')
    re_01=re.compile('cvs \\[r?log aborted\\]: (.+)$')
    re_02=re.compile('cvs (r?log|server): (.+)\n$')
    re_03=re.compile("(Cannot access.+CVSROOT)|(can't create temporary directory.+)$")
    re_10=re.compile('Working file: (.+)$')
    re_20=re.compile('symbolic names:')
    re_30=re.compile('\t(.+): ([\\d.]+)$')
    re_31=re.compile('----------------------------$')
    re_32=re.compile('=============================================================================$')
    re_50=re.compile('revision ([\\d.]+)(\s+locked by:\s+.+;)?$')
    re_60=re.compile(r'date:\s+(.+);\s+author:\s+(.+);\s+state:\s+(.+?);(\s+lines:\s+(\+\d+)?\s+(-\d+)?;)?')
    re_70=re.compile('branches: (.+);$')

    prefix=''   # leading path to strip of what we get from CVS

    if directory is None:
        # Current working directory

        # Get the real directory in the repository
        try:
            prefix=directory=file('CVS/Repository').read().strip()
        except IOError:
            raise cvsps_log_error('Not a CVS sandbox')

        if not prefix.endswith('/'):
            prefix+='/'

        # Use the Root file in the sandbox, if it exists
        try:
            root=file('CVS/Root').read().strip()
        except IOError:
            pass

    if not root:
        root=os.environ.get('CVSROOT',None)

    # read log cache if one exists
    oldlog=[]
    date=None

    if cache:
        cachedir=os.path.expanduser('~/.hg.cvsps')
        if not os.path.exists(cachedir):
            os.mkdir(cachedir)
        # cache file name is derived from root and directory, reduced
        # to word characters so it is safe as a single file name
        cachefile=(root or "").split(":")+[directory,"cache"]
        cachefile=['-'.join(re.findall(r'\w+',s)) for s in cachefile if s]
        cachefile=os.path.join(cachedir,'.'.join(cachefile))

        if cache=='update':
            try:
                ui.debug(_('reading cvs log cache %s\n')%cachefile)
                oldlog=pickle.load(file(cachefile))
                ui.debug(_('cache has %d log entries\n')%len(oldlog))
            except Exception,e:
                # a stale or unreadable cache is not fatal; fall back
                # to fetching the full log
                ui.debug(_('error reading cache: %r\n')%e)

            if oldlog:
                date=oldlog[-1].Date    # last commit date as a (time,tz) tuple
                date=util.datestr(date,'%Y/%m/%d %H:%M:%S %1%2')

    # build the CVS commandline
    cmd=['cvs','-q']
    if root:
        cmd.append('-d%s'%root)
        p=root.split(':')[-1]
        if not p.endswith('/'):
            p+='/'
        prefix=p+prefix
    cmd.append(['log','rlog'][rlog])
    if date:
        # no space between option and date string
        cmd.append('-d>%s'%date)
    cmd.append(directory)

    # state machine begins here; states seen in the parser below:
    #   0 - expecting "RCS file:" (or a cvs error/diagnostic line)
    #   1 - expecting "Working file:" (only used when rlog is false)
    #   2 - expecting "symbolic names:"
    #   3 - reading the symbolic names (tag -> revision) table
    #   4 - expecting a revision separator
    #   5 - expecting a "revision" line
    #   6 - expecting the date/author/state/lines line
    #   7 - reading the comment, possibly preceded by "branches:"
    #   8 - reading the comment after a "branches:" line
    tags={}     # dictionary of revisions on current file with their tags
    state=0
    store=False # set when a new record can be appended

    cmd=[util.shellquote(arg) for arg in cmd]

    for line in util.popen(' '.join(cmd)):
        if line.endswith('\n'):
            line=line[:-1]
        #ui.debug('state=%d line=%r\n'%(state,line))

        if state==0:
            match=re_00.match(line)
            if match:
                rcs=match.group(1)
                tags={}
                if rlog:
                    # rlog prints repository paths; strip the prefix,
                    # the ",v" suffix and any Attic component to get
                    # the working file name
                    filename=rcs[:-2]
                    if filename.startswith(prefix):
                        filename=filename[len(prefix):]
                    if filename.startswith('/'):
                        filename=filename[1:]
                    if filename.startswith('Attic/'):
                        filename=filename[6:]
                    else:
                        filename=filename.replace('/Attic/','/')
                    state=2
                    continue
                state=1
                continue
            match=re_01.match(line)
            if match:
                raise Exception(match.group(1))
            match=re_02.match(line)
            if match:
                raise Exception(match.group(2))
            if re_03.match(line):
                raise Exception(line)

        elif state==1:
            match=re_10.match(line)
            assert match,_('RCS file must be followed by working file')
            filename=match.group(1)
            state=2

        elif state==2:
            if re_20.match(line):
                state=3

        elif state==3:
            # read the symbolic names table, mapping revision tuples
            # to the list of tags on them
            match=re_30.match(line)
            if match:
                rev=[int(x) for x in match.group(2).split('.')]

                # Convert magic branch number to an odd-numbered one
                revn=len(rev)
                if revn>3 and (revn%2)==0 and rev[-2]==0:
                    rev=rev[:-2]+rev[-1:]
                rev=tuple(rev)

                if rev not in tags:
                    tags[rev]=[]
                tags[rev].append(match.group(1))

            elif re_31.match(line):
                state=5
            elif re_32.match(line):
                state=0

        elif state==4:
            if re_31.match(line):
                state=5
            else:
                assert not re_32.match(line),_('Must have at least some revisions')

        elif state==5:
            match=re_50.match(line)
            assert match,_('expected revision number')
            e=cvsps_log_entry(RCS=scache(rcs),File=scache(filename),
                Revision=tuple([int(x) for x in match.group(1).split('.')]),
                Branches=[],Parent=None)
            state=6

        elif state==6:
            match=re_60.match(line)
            assert match,_('revision must be followed by date line')
            d=match.group(1)
            if d[2]=='/':
                # Y2K
                d='19'+d

            if len(d.split())!=3:
                # cvs log dates always in GMT
                d=d+' UTC'
            e.Date=util.parsedate(d,['%y/%m/%d %H:%M:%S','%Y/%m/%d %H:%M:%S','%Y-%m-%d %H:%M:%S'])
            e.Author=scache(match.group(2))
            e.Dead=match.group(3).lower()=='dead'

            # lines: +N -M; either part may be absent
            if match.group(5):
                if match.group(6):
                    e.Lines=(int(match.group(5)),int(match.group(6)))
                else:
                    e.Lines=(int(match.group(5)),0)
            elif match.group(6):
                e.Lines=(0,int(match.group(6)))
            else:
                e.Lines=None
            e.Comment=[]
            state=7

        elif state==7:
            m=re_70.match(line)
            if m:
                e.Branches=[tuple([int(y) for y in x.strip().split('.')])
                            for x in m.group(1).split(';')]
                state=8
            elif re_31.match(line):
                # end of this revision's comment: store the entry
                state=5
                store=True
            elif re_32.match(line):
                # end of this file: store the entry
                state=0
                store=True
            else:
                e.Comment.append(line)

        elif state==8:
            if re_31.match(line):
                state=5
                store=True
            elif re_32.match(line):
                state=0
                store=True
            else:
                e.Comment.append(line)

        if store:
            # a complete log entry has been read; finalize and save it
            store=False
            e.Tags=[scache(x) for x in tags.get(e.Revision,[])]
            e.Tags.sort()
            e.Comment=scache('\n'.join(e.Comment))

            # a branch revision has an even number of >3 components,
            # e.g. 1.2.4.5 is on the branch tagged at 1.2.4
            revn=len(e.Revision)
            if revn>3 and (revn%2)==0:
                e.Branch=tags.get(e.Revision[:-1],[None])[0]
            else:
                e.Branch=None

            log.append(e)

            if len(log)%100==0:
                ui.status(util.ellipsis('%d %s'%(len(log),e.File),80)+'\n')

    listsort(log,key=lambda x:(x.RCS,x.Revision))

    # find parent revisions of individual files
    versions={}
    for e in log:
        branch=e.Revision[:-1]
        p=versions.get((e.RCS,branch),None)
        if p is None:
            # first revision seen on this branch: parent is the
            # branch point (revision minus the last two components)
            p=e.Revision[:-2]
        e.Parent=p
        versions[(e.RCS,branch)]=e.Revision

    # update the log cache
    if cache:
        if log:
            # join up the old and new logs
            listsort(log,key=lambda x:x.Date)

            if oldlog and oldlog[-1].Date>=log[0].Date:
                raise cvsps_log_error('Log cache overlaps with new log entries, re-run without cache.')

            log=oldlog+log

            # write the new cachefile
            ui.debug(_('writing cvs log cache %s\n')%cachefile)
            pickle.dump(log,file(cachefile,'w'))
        else:
            log=oldlog

    ui.status(_('%d log entries\n')%len(log))

    return log
+
+
class cvsps_changeset(object):
    '''A group of CVS log entries forming one logical commit.

    Attributes (set via keyword arguments and/or by the grouping code):
    .Author - author name as CVS knows it
    .Branch - name of branch this changeset is on, or None
    .Comment - commit message
    .Date - the commit date as a (time,tz) tuple
    .Entries - list of cvsps_log_entry objects in this changeset
    .Parent - list of one or two parent changesets
    .Tags - list of tags on this changeset
    '''
    def __init__(self,**entries):
        # store every keyword argument as an instance attribute
        for name,value in entries.items():
            setattr(self,name,value)
+
def cvsps_create_changeset(ui,log,fuzz=60,mergefrom=None,mergeto=None):
    '''Convert log into changesets.

    Groups cvsps_log_entry objects that share comment, author and
    branch and are at most *fuzz* seconds apart into cvsps_changeset
    objects, sorts them, collects tags, links parent changesets, and
    handles {{mergefrombranch X}} / {{mergetobranch X}} markers in the
    commit comments.

    ui        - Mercurial ui object for status output
    log       - list of cvsps_log_entry objects (sorted in place)
    fuzz      - maximum time in seconds between commits in a changeset
    mergefrom - regex (or pattern string) marking a merge from a branch;
                None selects the default {{mergefrombranch ([-\w]+)}}
    mergeto   - regex (or pattern string) marking a merge to a branch;
                None selects the default {{mergetobranch ([-\w]+)}}

    Returns the list of cvsps_changeset objects, numbered from 1 in .Id.
    '''

    ui.status(_('creating changesets\n'))

    # Merge changesets

    listsort(log,key=lambda x:(x.Comment,x.Author,x.Branch,x.Date))

    changeset=[]
    files={}
    c=None
    for i,e in enumerate(log):

        # Check if log entry belongs to the current changeset or not.
        # A file may appear only once per changeset, and the entry's
        # date must fall within fuzz seconds of the changeset's date.
        if not (c and
                e.Comment==c.Comment and
                e.Author==c.Author and
                e.Branch==c.Branch and
                (c.Date[0]+c.Date[1])<=(e.Date[0]+e.Date[1])<=(c.Date[0]+c.Date[1])+fuzz and
                e.File not in files):
            c=cvsps_changeset(Comment=e.Comment,Author=e.Author,
                Branch=e.Branch,Date=e.Date,Entries=[])
            changeset.append(c)
            files={}
            if len(changeset)%100==0:
                ui.status(util.ellipsis('%d %s'%(len(changeset),repr(e.Comment)[1:-1]),80)+'\n')

        e.Changeset=c
        c.Entries.append(e)
        files[e.File]=True
        c.Date=e.Date       # changeset date is date of latest commit in it

    # Sort files in each changeset

    for c in changeset:
        def pathcompare(l,r):
            'Mimic cvsps sorting order'
            l=l.split('/')
            r=r.split('/')
            nl=len(l)
            nr=len(r)
            n=min(nl,nr)
            for i in range(n):
                # shorter paths sort before their extensions
                if i+1==nl and nl<nr:
                    return -1
                elif i+1==nr and nl>nr:
                    return +1
                elif l[i]<r[i]:
                    return -1
                elif l[i]>r[i]:
                    return +1
            return 0
        def entitycompare(l,r):
            # compare two log entries by their file paths
            return pathcompare(l.File,r.File)

        c.Entries.sort(entitycompare)

    # Sort changesets by date

    def cscmp(l,r):
        # primary order: total commit time (seconds + tz offset)
        d=sum(l.Date)-sum(r.Date)
        if d:
            return d

        # detect vendor branches and initial commits on a branch
        # NOTE: the local dict 're' deliberately shadows the re module
        # inside this function only
        le={}
        for e in l.Entries:
            le[e.RCS]=e.Revision
        re={}
        for e in r.Entries:
            re[e.RCS]=e.Revision

        d=0
        # if r contains the parent revision of a file in l, then
        # r must come first (and vice versa below)
        for e in l.Entries:
            if re.get(e.RCS,None)==e.Parent:
                assert not d
                d=1
                break

        for e in r.Entries:
            if le.get(e.RCS,None)==e.Parent:
                assert not d
                d=-1
                break

        return d

    changeset.sort(cscmp)

    # Collect tags

    globaltags={}
    for c in changeset:
        tags={}
        for e in c.Entries:
            for tag in e.Tags:
                # remember which is the latest changeset to have this tag
                globaltags[tag]=c

    for c in changeset:
        tags={}
        for e in c.Entries:
            for tag in e.Tags:
                tags[tag]=True
        # remember tags only if this is the latest changeset to have it
        tagnames=[tag for tag in tags if globaltags[tag] is c]
        tagnames.sort()
        c.Tags=tagnames

    # Find parent changesets, handle {{mergetobranch BRANCHNAME}}
    # by inserting dummy changesets with two parents, and handle
    # {{mergefrombranch BRANCHNAME}} by setting two parents.

    if mergeto is None:
        mergeto=r'{{mergetobranch ([-\w]+)}}'
    if mergeto:
        mergeto=re.compile(mergeto)

    if mergefrom is None:
        mergefrom=r'{{mergefrombranch ([-\w]+)}}'
    if mergefrom:
        mergefrom=re.compile(mergefrom)

    versions={}    # changeset index where we saw any particular file version
    branches={}    # changeset index where we saw a branch
    n=len(changeset)
    i=0
    while i<n:
        c=changeset[i]

        for f in c.Entries:
            versions[(f.RCS,f.Revision)]=i

        # the parent is the last changeset on the same branch, or
        # failing that the latest changeset containing a parent
        # revision of any file in this changeset
        p=None
        if c.Branch in branches:
            p=branches[c.Branch]
        else:
            for f in c.Entries:
                p=max(p,versions.get((f.RCS,f.Parent),None))

        c.Parents=[]
        if p is not None:
            c.Parents.append(changeset[p])

        if mergefrom:
            m=mergefrom.search(c.Comment)
            if m:
                m=m.group(1)
                if m=='HEAD':
                    m=None
                if m in branches and c.Branch!=m:
                    # merge from branch: add the branch head as a
                    # second parent of this changeset
                    c.Parents.append(changeset[branches[m]])

        if mergeto:
            m=mergeto.search(c.Comment)
            if m:
                try:
                    m=m.group(1)
                    if m=='HEAD':
                        m=None
                except:
                    m=None   # if no group found then merge to HEAD
                if m in branches and c.Branch!=m:
                    # insert empty changeset for merge
                    cc=cvsps_changeset(Author=c.Author,Branch=m,Date=c.Date,
                            Comment='convert-repo: CVS merge from branch %s'%c.Branch,
                            Entries=[],Tags=[],Parents=[changeset[branches[m]],c])
                    changeset.insert(i+1,cc)
                    branches[m]=i+1

                    # adjust our loop counters now we have inserted a new entry
                    n+=1
                    i+=2
                    continue

        branches[c.Branch]=i
        i+=1

    # Number changesets

    for i,c in enumerate(changeset):
        c.Id=i+1

    ui.status(_('%d changeset entries\n')%len(changeset))

    return changeset
+
+# EOF cvsps.py
More information about the Mercurial-devel
mailing list