Source code for chicky.core

import datetime
import json
import os
from hashlib import blake2b
from pathlib import Path



[docs]
def checksum(filepath, digest_size=20):
    """
    Compute the blake2b checksum of a file without loading it whole in memory.

    The file is read chunk by chunk into a single reusable buffer, which keeps
    memory usage constant whatever the file size is.

    Borrowed from: https://stackoverflow.com/a/44873382

    .. Todo::
        Once minimal Python version support move to 3.11 we should be able to simplify
        this code with ``hashlib.file_digest()`` usage instead.

    Arguments:
        filepath (pathlib.Path): File path to open and checksum.

    Keyword Arguments:
        digest_size (integer): Size of the hash digest in bytes. The returned string
            is the hexadecimal digest, so it is twice as long: the default digest
            size of ``20`` results in a string of 40 characters.

    Returns:
        string: The file checksum as an hexadecimal string.
    """
    hasher = blake2b(digest_size=digest_size)
    # A single 128 KiB buffer, exposed through a memoryview so slicing it does
    # not copy data
    chunk = bytearray(128 * 1024)
    view = memoryview(chunk)

    # Unbuffered binary read: data goes straight into our own buffer
    with open(filepath, "rb", buffering=0) as stream:
        while True:
            count = stream.readinto(view)
            # Zero bytes read means the end of file has been reached
            if count == 0:
                break
            # Only feed the part of the buffer that has just been filled
            hasher.update(view[:count])

    return hasher.hexdigest()




[docs]
def collect_files(dirpath, extensions=None, dir_leads=None, filename_leads=None):
    """
    Recursively collect every file from a directory and compute their checksum.

    Arguments:
        dirpath (string or pathlib.Path): Base directory to walk through. Yielded
            paths are relative to this directory.

    Keyword Arguments:
        extensions (list): List of allowed file extensions, given without their
            leading dot. When it is not empty, each file that does not match any of
            those extensions will be ignored.
        dir_leads (list): A list of leading patterns to check on paths, each
            path starting with one of those patterns will be ignored. Match is performed
            against the relative directory path (from the ``dirpath``). The base
            directory itself has an empty relative path, so files directly inside
            it are not excluded by a pattern like ``.``.
        filename_leads (list): A list of leading patterns to check on filenames,
            each filename starting with one of those patterns will be ignored.

    Returns:
        Generator: Yield tuples of path + checksum string
    """
    # 'startswith' and 'endswith' accept a tuple of candidates
    extensions = tuple("." + v for v in extensions) if extensions else None
    dir_leads = tuple(dir_leads) if dir_leads else None
    filename_leads = tuple(filename_leads) if filename_leads else None

    for root, dirs, files in os.walk(dirpath):
        if dir_leads:
            # Computed once per directory instead of once per file
            rel = os.path.relpath(root, start=dirpath)
            # 'relpath' returns "." for the base directory itself, which is not
            # a real path component and must not match a lead such as "."
            if rel == os.curdir:
                rel = ""

            if rel.startswith(dir_leads):
                # Every descendant shares this leading path and so would be
                # ignored as well, prune the walk instead of visiting them
                dirs[:] = []
                continue

        for item in files:
            # Only collect item that pass filters
            if filename_leads and item.startswith(filename_leads):
                continue
            if extensions and not item.endswith(extensions):
                continue

            path = Path(os.path.join(root, item))
            yield (path.relative_to(dirpath), checksum(path))




[docs]
def formatter(files, args):
    """
    Serialize collected file checksums to a string.

    Arguments:
        files (iterable): Iterable of tuples ``(path, checksum)``. It is consumed
            only once.
        args (object): Object holding the options. Attribute ``format`` selects the
            output, with ``text`` for a plain listing and anything else for JSON.
            Attributes ``source`` and ``ext`` are only read for the JSON output.

    Returns:
        string: With ``text`` format, one line per file with path and checksum
        separated by a tabulation. Otherwise a JSON document indented with 4 spaces
        which contains the creation datetime, the base directory, the extensions
        and a mapping of file paths to their checksum.
    """
    if args.format != "text":
        # Keys are declared in the order they are expected in the JSON output
        manifest = {
            "created": datetime.datetime.now().isoformat(timespec="seconds"),
            "basedir": str(args.source),
            "extensions": args.ext,
            "files": {},
        }
        for path, digest in files:
            manifest["files"][str(path)] = digest

        return json.dumps(manifest, indent=4)

    lines = []
    for path, digest in files:
        lines.append(str(path) + "\t" + digest)

    return "\n".join(lines)