Implement independent directory tree hashing without tar
This patch implements the os.walk() based directory hashing mentioned
in 3fb80a43
. Rough benchmarking shows that while it's not terrible
performance wise, the GNU tar based solution almost always outperforms
it. Therefore it is used as a fallback implementation in case GNU tar is
not found when trying to generate an identicon from a directory tree.
This commit is contained in:
parent
3179be1f76
commit
bde2b0067e
44
main.py
44
main.py
@ -1,16 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
# pylint: disable=consider-using-with
|
||||
# (this code contains some IO stream juggling)
|
||||
from sys import stdin
|
||||
from sys import exit as sysexit
|
||||
from io import BytesIO
|
||||
from subprocess import Popen, PIPE
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
from blake3 import blake3
|
||||
|
||||
from identicon import Identicon
|
||||
from stream import get_deterministic_stream, ClosableStream
|
||||
|
||||
|
||||
DIGEST_SIZE = 20
|
||||
@ -52,49 +49,12 @@ def get_input_stream(kwargs):
|
||||
if (text := kwargs['text']) is not None:
|
||||
stream = ClosableStream(BytesIO(text.encode()))
|
||||
elif file := kwargs['file']:
|
||||
stream = get_deterministic_stream(file)
|
||||
stream = get_deterministic_stream(file, BUF_SIZE)
|
||||
elif not stdin.isatty():
|
||||
stream = ClosableStream(stdin.buffer)
|
||||
return stream
|
||||
|
||||
|
||||
class ClosableStream:
|
||||
def __init__(self, stream, close_func=None):
|
||||
self.stream = stream
|
||||
self._close_func = close_func or (lambda: None)
|
||||
|
||||
def close(self):
|
||||
return self._close_func()
|
||||
|
||||
|
||||
def get_deterministic_stream(file):
|
||||
if Path(file).is_dir():
|
||||
return get_deterministic_tar_stream(file)
|
||||
|
||||
ifile = open(file, 'rb')
|
||||
return ClosableStream(ifile, ifile.close)
|
||||
|
||||
|
||||
def get_deterministic_tar_stream(file):
|
||||
cmd = (
|
||||
'tar',
|
||||
f'--blocking-factor={BUF_SIZE//512}',
|
||||
'--sort=name',
|
||||
'--mtime=UTC 1970-01-01',
|
||||
'--owner=root:0', '--group=root:0', '--numeric-owner',
|
||||
'--mode=a=rwX', '--no-acls', '--no-xattrs', '--no-selinux',
|
||||
'-C', file, '-cf', '-', '.'
|
||||
)
|
||||
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
||||
|
||||
def wait_and_check_exitcode():
|
||||
exit_code = p.wait()
|
||||
if exit_code != 0:
|
||||
raise RuntimeError(f'Tar failed: {p.stderr.read().decode()}')
|
||||
|
||||
return ClosableStream(p.stdout, wait_and_check_exitcode)
|
||||
|
||||
|
||||
def print_usage_and_exit():
|
||||
command = main
|
||||
with click.Context(command) as ctx:
|
||||
|
124
stream.py
Normal file
124
stream.py
Normal file
@ -0,0 +1,124 @@
|
||||
# pylint: disable=consider-using-with
|
||||
# (this code contains some IO stream juggling)
|
||||
from subprocess import Popen, PIPE, run, STDOUT
|
||||
from os import walk
|
||||
from os.path import abspath, relpath, join
|
||||
from pathlib import Path
|
||||
from sys import stderr
|
||||
|
||||
|
||||
class ClosableStream:
|
||||
def __init__(self, stream, close_func=None):
|
||||
self.stream = stream
|
||||
self._close_func = close_func or (lambda: None)
|
||||
|
||||
def close(self):
|
||||
return self._close_func()
|
||||
|
||||
|
||||
def get_deterministic_stream(path, buffer_size):
|
||||
if Path(path).is_dir():
|
||||
return _get_directorytree_stream(path, buffer_size)
|
||||
|
||||
ifile = open(path, 'rb')
|
||||
return ClosableStream(ifile, ifile.close)
|
||||
|
||||
|
||||
def _get_directorytree_stream(path, buffer_size):
|
||||
if _gnu_tar_available():
|
||||
return TarDirectorytreeStream(path, buffer_size)
|
||||
print(
|
||||
"GNU tar not found, falling back to less performant Python implementation.",
|
||||
file=stderr
|
||||
)
|
||||
return WalkDirectorytreeStream(path)
|
||||
|
||||
|
||||
def _gnu_tar_available():
|
||||
proc = run(('tar', '--version'), stdout=PIPE, stderr=STDOUT, check=False)
|
||||
if proc.returncode != 0:
|
||||
return False
|
||||
return 'GNU tar' in proc.stdout.decode()
|
||||
|
||||
|
||||
class TarDirectorytreeStream:
|
||||
def __init__(self, path, buffer_size):
|
||||
self._path = path
|
||||
self._buffer_size = buffer_size
|
||||
self._process = None
|
||||
|
||||
@property
|
||||
def stream(self):
|
||||
cmd = (
|
||||
'tar',
|
||||
f'--blocking-factor={self._buffer_size//512}',
|
||||
'--sort=name',
|
||||
'--mtime=UTC 1970-01-01',
|
||||
'--owner=root:0', '--group=root:0', '--numeric-owner',
|
||||
'--mode=a=rwX', '--no-acls', '--no-xattrs', '--no-selinux',
|
||||
'-C', self._path, '-cf', '-', '.'
|
||||
)
|
||||
self._process = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
||||
|
||||
return self._process.stdout
|
||||
|
||||
def close(self):
|
||||
if self._process is not None:
|
||||
exit_code = self._process.wait()
|
||||
if exit_code != 0:
|
||||
raise RuntimeError(f'Tar failed: {self._process.stderr.read().decode()}')
|
||||
|
||||
|
||||
class WalkDirectorytreeStream:
|
||||
def __init__(self, path):
|
||||
self._path = path
|
||||
self.stream = self
|
||||
self._stream_generator = None
|
||||
|
||||
def read(self, buffer_size):
|
||||
if self._stream_generator is None:
|
||||
self._stream_generator = self._stream_all_files(buffer_size)
|
||||
return next(self._stream_generator, None)
|
||||
|
||||
def _stream_all_files(self, buffer_size):
|
||||
"""
|
||||
This method makes sure that the yielded bytestreams between
|
||||
different invocations are equvivalent if and only if the
|
||||
input directory trees are equal in structure and contents.
|
||||
|
||||
Problematic edge cases include:
|
||||
- Different directory trees resulting in the same walk
|
||||
(i.e. yielding the same files in the same order)
|
||||
- Directory trees where files are split/merged
|
||||
Solutions:
|
||||
- We ensure a deterministic walk order by sorting the
|
||||
filenames/dirnames
|
||||
- We yield the relative path[1] of every file before yielding
|
||||
their contents. These filenames will end up in a hash function
|
||||
ensuring that different directory structures produce different
|
||||
fingerprints
|
||||
|
||||
[1]: relative to the directory being streamed
|
||||
|
||||
Note that we use os.path instead of pathlib because:
|
||||
- Path.resolve() reads the filesystem instead of just
|
||||
manipulating strings like abspath and friends do
|
||||
(which makes os.path a lot faster as it tends to avoid disk IO)
|
||||
- pathlib has no relpath
|
||||
"""
|
||||
|
||||
for dirpath, dirnames, filenames in walk(self._path, topdown=True):
|
||||
dirnames.sort()
|
||||
filenames.sort()
|
||||
for filename in filenames:
|
||||
filepath = join(dirpath, filename)
|
||||
yield self._get_relative_path(filepath).encode()
|
||||
with open(filepath, 'rb') as ifile:
|
||||
while data := ifile.read(buffer_size):
|
||||
yield data
|
||||
|
||||
def _get_relative_path(self, filepath):
|
||||
return relpath(abspath(filepath), start=abspath(self._path))
|
||||
|
||||
def close(self):
|
||||
pass
|
Loading…
Reference in New Issue
Block a user