identicon/identicon/stream.py

125 lines
4.1 KiB
Python

# pylint: disable=consider-using-with
# (this code contains some IO stream juggling)
from subprocess import Popen, PIPE, run, STDOUT
from os import walk
from os.path import abspath, relpath, join
from pathlib import Path
from sys import stderr
class ClosableStream:
    """Bundle a stream object with the callable that releases it.

    The wrapped object is exposed as ``stream``; ``close()`` delegates to
    the callable given at construction time.
    """

    def __init__(self, stream, close_func=None):
        """Store ``stream`` and the optional zero-argument ``close_func``.

        When ``close_func`` is missing (or otherwise falsy) closing is a
        no-op that returns ``None``.
        """
        self.stream = stream
        if close_func:
            self._close_func = close_func
        else:
            self._close_func = lambda: None

    def close(self):
        """Call the stored close callable and hand back its return value."""
        closer = self._close_func
        return closer()
def get_deterministic_stream(path, buffer_size):
    """Return a closable byte-stream wrapper for a file or a directory.

    Args:
        path: Location of a regular file or a directory.
        buffer_size: Forwarded to the directory-tree stream; not used when
            ``path`` is a plain file.

    Returns:
        An object exposing a ``stream`` attribute and a ``close()`` method.
    """
    if not Path(path).is_dir():
        # Plain file: hand out the binary handle together with its closer.
        handle = open(path, 'rb')
        return ClosableStream(handle, handle.close)
    return _get_directorytree_stream(path, buffer_size)
def _get_directorytree_stream(path, buffer_size):
    """Pick the directory-tree stream implementation for ``path``.

    The tar-backed stream is used when GNU tar is available; otherwise a
    notice is written to stderr and the pure-Python walker is returned
    (which does not take ``buffer_size`` at construction time).
    """
    if not _gnu_tar_available():
        print(
            "GNU tar not found, falling back to less performant Python implementation.",
            file=stderr
        )
        return WalkDirectorytreeStream(path)
    return TarDirectorytreeStream(path, buffer_size)
def _gnu_tar_available():
proc = run(('tar', '--version'), stdout=PIPE, stderr=STDOUT, check=False)
if proc.returncode != 0:
return False
return 'GNU tar' in proc.stdout.decode()
class TarDirectorytreeStream:
    """Stream a directory tree as a normalized GNU tar archive.

    A ``tar`` subprocess writes the archive to its stdout. The command line
    pins entry order, timestamps, ownership and permissions and leaves out
    ACLs, extended attributes and SELinux contexts, so that the produced
    bytes depend on the tree's structure and contents only.
    """

    def __init__(self, path, buffer_size):
        """Remember the directory and read size; no process is started yet.

        Args:
            path: Directory to archive (passed to ``tar -C``).
            buffer_size: Intended read size in bytes; used to derive tar's
                blocking factor.
        """
        self._path = path
        self._buffer_size = buffer_size
        self._process = None

    def _build_command(self):
        """Return the tar argument vector used to serialize the tree."""
        # tar counts in 512-byte records and rejects a blocking factor of 0,
        # so buffer sizes below 512 bytes are rounded up to a single record.
        blocking_factor = max(1, self._buffer_size // 512)
        return (
            'tar',
            f'--blocking-factor={blocking_factor}',
            '--sort=name',
            '--mtime=UTC 1970-01-01',
            '--owner=root:0', '--group=root:0', '--numeric-owner',
            '--mode=a=rwX', '--no-acls', '--no-xattrs', '--no-selinux',
            '-C', self._path, '-cf', '-', '.'
        )

    @property
    def stream(self):
        """Binary pipe carrying the archive produced by the tar subprocess.

        The subprocess is started on first access and reused afterwards, so
        repeated accesses return the same pipe instead of spawning a new
        tar process each time. After ``close()`` a new access starts over.
        """
        if self._process is None:
            self._process = Popen(self._build_command(), stdout=PIPE, stderr=PIPE)
        return self._process.stdout

    def close(self):
        """Wait for tar to finish and release its pipes.

        Does nothing when no process is running (never started or already
        closed).

        Raises:
            RuntimeError: If tar exited with a non-zero status; the message
                contains whatever tar wrote to stderr.
        """
        process = self._process
        if process is None:
            return
        # Forget the process first so close() stays idempotent even when it
        # raises below.
        self._process = None
        # communicate() drains both pipes while waiting and closes them
        # afterwards. A bare wait() can deadlock once the child blocks on a
        # full pipe, and it leaves the pipe file descriptors open.
        _, errors = process.communicate()
        if process.returncode != 0:
            # errors='replace': undecodable bytes (e.g. odd file names in
            # tar's diagnostics) must not mask the real failure.
            raise RuntimeError(f'Tar failed: {errors.decode(errors="replace")}')
class WalkDirectorytreeStream:
    """Pure-Python byte stream over the files of a directory tree.

    The instance is its own ``stream``: callers use ``obj.stream.read(n)``
    just like with the other stream wrappers in this module.
    """

    def __init__(self, path):
        """Remember the root directory; the walk starts on the first read."""
        self._path = path
        self.stream = self
        self._stream_generator = None

    def read(self, buffer_size):
        """Return the next chunk of the stream, or ``b''`` when exhausted.

        Args:
            buffer_size: Maximum size of a file-content chunk. Only the
                value given on the first call is used, because that call
                creates the underlying generator. Chunks carrying a file
                path are emitted whole and may be longer.

        Returns:
            The next ``bytes`` chunk; ``b''`` at end of stream, matching
            what binary file objects return at EOF.
        """
        if self._stream_generator is None:
            self._stream_generator = self._stream_all_files(buffer_size)
        return next(self._stream_generator, b'')

    def _stream_all_files(self, buffer_size):
        """
        Yield, for every file in the tree, its relative path followed by
        its contents in chunks of at most ``buffer_size`` bytes.

        The intent is that the yielded bytestreams of different
        invocations are equivalent if and only if the input directory
        trees are equal in structure and contents.

        Problematic edge cases include:
        - Different directory trees resulting in the same walk
          (i.e. yielding the same files in the same order)
        - Directory trees where files are split/merged

        Solutions:
        - We ensure a deterministic walk order by sorting the
          filenames/dirnames
        - We yield the relative path[1] of every file before yielding
          their contents. These filenames will end up in a hash function
          ensuring that different directory structures produce different
          fingerprints

        [1]: relative to the directory being streamed

        NOTE(review): paths and contents are emitted back to back without
        a delimiter or length prefix, so e.g. a file ``a`` containing
        ``b"b"`` concatenates to the same bytes as an empty file ``ab``.
        Directories without files are not represented either. Changing
        the format would alter the emitted bytes, so it is left as is.

        Note that we use os.path instead of pathlib because:
        - Path.resolve() reads the filesystem instead of just
          manipulating strings like abspath and friends do
          (which makes os.path a lot faster as it tends to avoid disk IO)
        - pathlib has no relpath
        """
        for dirpath, dirnames, filenames in walk(self._path, topdown=True):
            # Sorting dirnames in place also fixes the order in which
            # os.walk descends (topdown mode honours in-place changes).
            dirnames.sort()
            filenames.sort()
            for filename in filenames:
                filepath = join(dirpath, filename)
                # surrogateescape: names that are not valid UTF-8 on disk
                # arrive with lone surrogates; encode them back to their
                # original bytes instead of raising UnicodeEncodeError.
                # Valid names encode exactly as before.
                relative = self._get_relative_path(filepath)
                yield relative.encode('utf-8', 'surrogateescape')
                with open(filepath, 'rb') as ifile:
                    while data := ifile.read(buffer_size):
                        yield data

    def _get_relative_path(self, filepath):
        """Return ``filepath`` relative to the directory being streamed."""
        return relpath(abspath(filepath), start=abspath(self._path))

    def close(self):
        """Nothing to release; present for interface parity."""