[PATCH v2] lazymanifest: write a more efficient, pypy friendly version of lazymanifest
Augie Fackler
raf at durin42.com
Mon Sep 19 19:43:16 EDT 2016
> On Sep 17, 2016, at 14:23, Maciej Fijalkowski <fijall at gmail.com> wrote:
>
> This should fix the issues presented.
>
> There is one problem which is that the hash in test-rebase-detach
> changes. The way I see it is just an RNG phase-shift, but it might be
> a real bug. Some actual input will be appreciated.
I'll get a look at this in the morning America/New_York - my other miscellany occupied me for far too long today and I just don't have brainpower for this tonight. Sorry to keep you waiting.
AF
>
>
>
> On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall at gmail.com> wrote:
>> # HG changeset patch
>> # User Maciej Fijalkowski <fijall at gmail.com>
>> # Date 1473680234 -7200
>> # Mon Sep 12 13:37:14 2016 +0200
>> # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
>> # Parent df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>> lazymanifest: write a more efficient, pypy friendly version of lazymanifest
>>
>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
>> --- a/mercurial/manifest.py
>> +++ b/mercurial/manifest.py
>> @@ -104,69 +104,297 @@
>> _checkforbidden(files)
>> return ''.join(lines)
>>
>> -class _lazymanifest(dict):
>> - """This is the pure implementation of lazymanifest.
>> -
>> - It has not been optimized *at all* and is not lazy.
>> - """
>> -
>> - def __init__(self, data):
>> - dict.__init__(self)
>> - for f, n, fl in _parse(data):
>> - self[f] = n, fl
>> -
>> - def __setitem__(self, k, v):
>> - node, flag = v
>> - assert node is not None
>> - if len(node) > 21:
>> - node = node[:21] # match c implementation behavior
>> - dict.__setitem__(self, k, (node, flag))
>> +class lazymanifestiter(object):
>> + def __init__(self, lm):
>> + self.pos = 0
>> + self.lm = lm
>>
>> def __iter__(self):
>> - return iter(sorted(dict.keys(self)))
>> + return self
>>
>> - def iterkeys(self):
>> - return iter(sorted(dict.keys(self)))
>> + def next(self):
>> + try:
>> + data, pos = self.lm._get(self.pos)
>> + except IndexError:
>> + raise StopIteration
>> + if pos == -1:
>> + self.pos += 1
>> + return data[0]
>> + self.pos += 1
>> + zeropos = data.find('\x00', pos)
>> + return data[pos:zeropos]
>>
>> - def iterentries(self):
>> - return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>> +class lazymanifestiterentries(object):
>> + def __init__(self, lm):
>> + self.lm = lm
>> + self.pos = 0
>> +
>> + def __iter__(self):
>> + return self
>> +
>> + def next(self):
>> + try:
>> + data, pos = self.lm._get(self.pos)
>> + except IndexError:
>> + raise StopIteration
>> + if pos == -1:
>> + self.pos += 1
>> + return data
>> + zeropos = data.find('\x00', pos)
>> + hashval = unhexlify(data, self.lm.extrainfo[self.pos],
>> + zeropos + 1, 40)
>> + flags = self.lm._getflags(data, self.pos, zeropos)
>> + self.pos += 1
>> + return (data[pos:zeropos], hashval, flags)
>> +
>> +def unhexlify(data, extra, pos, length):
>> + s = data[pos:pos + length].decode('hex')
>> + if extra:
>> + s += chr(extra & 0xff)
>> + return s
>> +
>> +def _cmp(a, b):
>> + return (a > b) - (a < b)
>> +
>> +class _lazymanifest(object):
>> + def __init__(self, data, positions=None, extrainfo=None, extradata=None):
>> + if positions is None:
>> + self.positions = self.findlines(data)
>> + self.extrainfo = [0] * len(self.positions)
>> + self.data = data
>> + self.extradata = []
>> + else:
>> + self.positions = positions[:]
>> + self.extrainfo = extrainfo[:]
>> + self.extradata = extradata[:]
>> + self.data = data
>> +
>> + def findlines(self, data):
>> + if not data:
>> + return []
>> + pos = data.find("\n")
>> + positions = [0]
>> + while pos < len(data) - 1 and pos != -1:
>> + positions.append(pos + 1)
>> + pos = data.find("\n", pos + 1)
>> + return positions
>> +
>> + def _get(self, index):
>> + # get the position encoded in pos:
>> + # positive number is an index in 'data'
>> + # negative number is in extrapieces
>> + pos = self.positions[index]
>> + if pos >= 0:
>> + return self.data, pos
>> + return self.extradata[-pos - 1], -1
>> +
>> + def _getkey(self, pos):
>> + if pos >= 0:
>> + return self.data[pos:self.data.find('\x00', pos + 1)]
>> + return self.extradata[-pos - 1][0]
>> +
>> + def bsearch(self, key):
>> + first = 0
>> + last = len(self.positions) - 1
>> +
>> + while first <= last:
>> + midpoint = (first + last)//2
>> + nextpos = self.positions[midpoint]
>> + candidate = self._getkey(nextpos)
>> + r = _cmp(key, candidate)
>> + if r == 0:
>> + return midpoint
>> + else:
>> + if r < 0:
>> + last = midpoint - 1
>> + else:
>> + first = midpoint + 1
>> + return -1
>> +
>> + def bsearch2(self, key):
>> + # same as the above, but will always return the position
>> + # done for performance reasons
>> + first = 0
>> + last = len(self.positions) - 1
>> +
>> + while first <= last:
>> + midpoint = (first + last)//2
>> + nextpos = self.positions[midpoint]
>> + candidate = self._getkey(nextpos)
>> + r = _cmp(key, candidate)
>> + if r == 0:
>> + return (midpoint, True)
>> + else:
>> + if r < 0:
>> + last = midpoint - 1
>> + else:
>> + first = midpoint + 1
>> + return (first, False)
>> +
>> + def __contains__(self, key):
>> + return self.bsearch(key) != -1
>> +
>> + def _getflags(self, data, needle, pos):
>> + start = pos + 41
>> + end = data.find("\n", start)
>> + if end == -1:
>> + end = len(data) - 1
>> + if start == end:
>> + return ''
>> + return self.data[start:end]
>> +
>> + def __getitem__(self, key):
>> + if not isinstance(key, str):
>> + raise TypeError("getitem: manifest keys must be a string.")
>> + needle = self.bsearch(key)
>> + if needle == -1:
>> + raise KeyError
>> + data, pos = self._get(needle)
>> + if pos == -1:
>> + return (data[1], data[2])
>> + zeropos = data.find('\x00', pos)
>> + assert 0 <= needle <= len(self.positions)
>> + assert len(self.extrainfo) == len(self.positions)
>> + hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
>> + flags = self._getflags(data, needle, zeropos)
>> + return (hashval, flags)
>> +
>> + def __delitem__(self, key):
>> + needle, found = self.bsearch2(key)
>> + if not found:
>> + raise KeyError
>> + cur = self.positions[needle]
>> + self.positions = self.positions[:needle] + self.positions[needle + 1:]
>> + self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
>> + if cur >= 0:
>> + self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
>> +
>> + def __setitem__(self, key, value):
>> + if not isinstance(key, str):
>> + raise TypeError("setitem: manifest keys must be a string.")
>> + if not isinstance(value, tuple) or len(value) != 2:
>> + raise TypeError("Manifest values must be a tuple of (node, flags).")
>> + hashval = value[0]
>> + if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
>> + raise TypeError("node must be a 20-byte string")
>> + flags = value[1]
>> + if not isinstance(flags, str) or len(flags) > 1:
>> + raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
>> + needle, found = self.bsearch2(key)
>> + if found:
>> + # put the item
>> + pos = self.positions[needle]
>> + if pos < 0:
>> + self.extradata[-pos - 1] = (key, value[0], value[1])
>> + else:
>> + # just don't bother
>> + self.extradata.append((key, value[0], value[1]))
>> + self.positions[needle] = -len(self.extradata)
>> + else:
>> + # not found, put it in with extra positions
>> + self.extradata.append((key, value[0], value[1]))
>> + self.positions = (self.positions[:needle] + [-len(self.extradata)]
>> + + self.positions[needle:])
>> + self.extrainfo = (self.extrainfo[:needle] + [0] +
>> + self.extrainfo[needle:])
>>
>> def copy(self):
>> - c = _lazymanifest('')
>> - c.update(self)
>> - return c
>> + # XXX call _compact like in C?
>> + return _lazymanifest(self.data, self.positions, self.extrainfo,
>> + self.extradata)
>> +
>> + def _compact(self):
>> + # hopefully not called TOO often
>> + def findnextposition(positions, start, lgt):
>> + i = start
>> + while i < len(positions):
>> + if positions[i] >= 0:
>> + return positions[i]
>> + i += 1
>> + return lgt
>> +
>> + if len(self.extradata) == 0:
>> + return
>> + l = []
>> + last_cut = 0
>> + i = 0
>> + offset = 0
>> + self.extrainfo = [0] * len(self.positions)
>> + while i < len(self.positions):
>> + if self.positions[i] >= 0:
>> + cur = self.positions[i]
>> + last_cut = cur
>> + while True:
>> + self.positions[i] = offset
>> + i += 1
>> + if i == len(self.positions) or self.positions[i] < 0:
>> + break
>> + offset += self.positions[i] - cur
>> + cur = self.positions[i]
>> + end_cut = findnextposition(self.positions, i, len(self.data))
>> + offset += end_cut - cur
>> + l.append(self.data[last_cut:end_cut])
>> + else:
>> + while i < len(self.positions) and self.positions[i] < 0:
>> + cur = self.positions[i]
>> + t = self.extradata[-cur - 1]
>> + l.append(self._pack(t))
>> + self.positions[i] = offset
>> + if len(t[1]) > 20:
>> + self.extrainfo[i] = ord(t[1][21])
>> + offset += len(l[-1])
>> + i += 1
>> + self.data = ''.join(l)
>> + self.extradata = []
>> +
>> + def _pack(self, d):
>> + return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
>> +
>> + def text(self):
>> + self._compact()
>> + return self.data
>>
>> def diff(self, m2, clean=False):
>> '''Finds changes between the current manifest and m2.'''
>> + # XXX think whether efficiency matters here
>> diff = {}
>>
>> - for fn, e1 in self.iteritems():
>> + for fn, e1, flags in self.iterentries():
>> if fn not in m2:
>> - diff[fn] = e1, (None, '')
>> + diff[fn] = (e1, flags), (None, '')
>> else:
>> e2 = m2[fn]
>> - if e1 != e2:
>> - diff[fn] = e1, e2
>> + if (e1, flags) != e2:
>> + diff[fn] = (e1, flags), e2
>> elif clean:
>> diff[fn] = None
>>
>> - for fn, e2 in m2.iteritems():
>> + for fn, e2, flags in m2.iterentries():
>> if fn not in self:
>> - diff[fn] = (None, ''), e2
>> + diff[fn] = (None, ''), (e2, flags)
>>
>> return diff
>>
>> + def iterentries(self):
>> + return lazymanifestiterentries(self)
>> +
>> + def iterkeys(self):
>> + return lazymanifestiter(self)
>> +
>> + def __iter__(self):
>> + return lazymanifestiter(self)
>> +
>> + def __len__(self):
>> + return len(self.positions)
>> +
>> def filtercopy(self, filterfn):
>> + # XXX should be optimized
>> c = _lazymanifest('')
>> for f, n, fl in self.iterentries():
>> if filterfn(f):
>> c[f] = n, fl
>> return c
>>
>> - def text(self):
>> - """Get the full data of this manifest as a bytestring."""
>> - return _textv1(self.iterentries())
>> -
>> try:
>> _lazymanifest = parsers.lazymanifest
>> except AttributeError:
>> diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
>> --- a/mercurial/pure/bdiff.py
>> +++ b/mercurial/pure/bdiff.py
>> @@ -111,8 +111,8 @@
>> def blocks(sa, sb):
>> a = ffi.new("struct bdiff_line**")
>> b = ffi.new("struct bdiff_line**")
>> - ac = ffi.new("char[]", sa)
>> - bc = ffi.new("char[]", sb)
>> + ac = ffi.new("char[]", str(sa))
>> + bc = ffi.new("char[]", str(sb))
>> l = ffi.new("struct bdiff_hunk*")
>> try:
>> an = lib.bdiff_splitlines(ac, len(sa), a)
>> @@ -138,8 +138,8 @@
>> def bdiff(sa, sb):
>> a = ffi.new("struct bdiff_line**")
>> b = ffi.new("struct bdiff_line**")
>> - ac = ffi.new("char[]", sa)
>> - bc = ffi.new("char[]", sb)
>> + ac = ffi.new("char[]", str(sa))
>> + bc = ffi.new("char[]", str(sb))
>> l = ffi.new("struct bdiff_hunk*")
>> try:
>> an = lib.bdiff_splitlines(ac, len(sa), a)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
More information about the Mercurial-devel
mailing list