Source code for smith_utils.file.classification

"""Utilities for classifying file types from multiple signals."""

from __future__ import annotations

from dataclasses import dataclass
import mimetypes
from pathlib import Path
import shutil
import subprocess


# Known leading byte sequences ("magic numbers") used to identify file formats.
# Each entry is a 4-tuple:
#   (signature prefix, human-readable type, MIME type, category tags)
# ``_classify_magic`` walks this list in order and returns the first entry
# whose signature the header starts with.
_MAGIC_SIGNATURES = [
    (b"%PDF-", "PDF document", "application/pdf", {"document", "pdf"}),
    (b"\x89PNG\r\n\x1a\n", "PNG image", "image/png", {"image", "png"}),
    (b"\xff\xd8\xff", "JPEG image", "image/jpeg", {"image", "jpeg"}),
    (b"GIF87a", "GIF image", "image/gif", {"image", "gif"}),
    (b"GIF89a", "GIF image", "image/gif", {"image", "gif"}),
    # Three "PK" signatures are all reported as the same ZIP archive type.
    (b"PK\x03\x04", "ZIP archive", "application/zip", {"archive", "zip"}),
    (b"PK\x05\x06", "ZIP archive", "application/zip", {"archive", "zip"}),
    (b"PK\x07\x08", "ZIP archive", "application/zip", {"archive", "zip"}),
    (b"\x1f\x8b", "gzip compressed data", "application/gzip", {"archive", "gzip"}),
    (b"BZh", "bzip2 compressed data", "application/x-bzip2", {"archive", "bzip2"}),
    (b"7z\xbc\xaf\x27\x1c", "7-zip archive", "application/x-7z-compressed", {"archive", "7z"}),
    (b"SQLite format 3\x00", "SQLite database", "application/vnd.sqlite3", {"database", "sqlite"}),
    (b"\x7fELF", "ELF executable", "application/x-elf", {"binary", "executable", "elf"}),
    # This signature is ambiguous between two formats, so it only gets the
    # generic MIME type and the bare "binary" tag.
    (b"\xca\xfe\xba\xbe", "Java class or Mach-O universal binary", "application/octet-stream", {"binary"}),
    # Four Mach-O signatures share one description, MIME type and tag set.
    (b"\xfe\xed\xfa\xce", "Mach-O executable", "application/x-mach-binary", {"binary", "executable", "mach-o"}),
    (b"\xfe\xed\xfa\xcf", "Mach-O executable", "application/x-mach-binary", {"binary", "executable", "mach-o"}),
    (b"\xce\xfa\xed\xfe", "Mach-O executable", "application/x-mach-binary", {"binary", "executable", "mach-o"}),
    (b"\xcf\xfa\xed\xfe", "Mach-O executable", "application/x-mach-binary", {"binary", "executable", "mach-o"}),
]


# Category tags implied by specific MIME types, keyed by the exact MIME string.
# ``_categories_for_mime`` additionally tags a MIME type with its top-level
# type when that is one of audio/font/image/text/video, so such types only
# need an entry here when a more specific tag applies (e.g. ``text/csv``).
_MIME_CATEGORIES = {
    "application/gzip": {"archive", "gzip"},
    "application/pdf": {"document", "pdf"},
    "application/vnd.sqlite3": {"database", "sqlite"},
    "application/x-7z-compressed": {"archive", "7z"},
    "application/x-bzip2": {"archive", "bzip2"},
    "application/x-elf": {"binary", "executable", "elf"},
    "application/x-mach-binary": {"binary", "executable", "mach-o"},
    "application/zip": {"archive", "zip"},
    "text/csv": {"text", "csv"},
}


# Category tags implied by a file name suffix. Keys are lower-case and include
# the leading dot, matching the ``Path.suffix.lower()`` value that
# ``classify_file`` looks up.
_EXTENSION_CATEGORIES = {
    ".7z": {"archive", "7z"},
    ".bz2": {"archive", "bzip2"},
    ".csv": {"text", "csv"},
    # ".db" carries only the broad tag; no specific database format is implied.
    ".db": {"database"},
    ".gif": {"image", "gif"},
    ".gz": {"archive", "gzip"},
    ".jpeg": {"image", "jpeg"},
    ".jpg": {"image", "jpeg"},
    ".json": {"text", "json"},
    ".md": {"text", "markdown"},
    ".pdf": {"document", "pdf"},
    ".png": {"image", "png"},
    ".sqlite": {"database", "sqlite"},
    ".sqlite3": {"database", "sqlite"},
    ".txt": {"text"},
    ".zip": {"archive", "zip"},
}


# Broad category tags in the order ``_select_file_class`` checks them: the
# first tag in this tuple that is present among a file's categories becomes
# its single ``file_class``.
_CLASS_PRIORITY = (
    "document",
    "image",
    "video",
    "audio",
    "archive",
    "database",
    "text",
    "binary",
    "font",
)



[docs]
@dataclass(frozen=True)
class FileClassification:
    """Classification evidence for a file path.

    Immutable record produced by ``classify_file``. It keeps each individual
    signal alongside the combined category tags and the single class chosen
    from them.
    """

    # The classified path, as a ``Path``.
    path: Path
    # Lower-cased final suffix of the path, including the dot ("" if none).
    extension: str
    # Output of ``file --brief``, or None when that command yielded nothing.
    file_description: str | None
    # Output of ``file --brief --mime-type``, or None when unavailable.
    file_mime_type: str | None
    # MIME type guessed from the file name by ``mimetypes``, or None.
    extension_mime_type: str | None
    # Description of the matching magic-byte signature, or None if no match.
    magic_type: str | None
    # MIME type of the matching magic-byte signature, or None if no match.
    magic_mime_type: str | None
    # Single headline class selected from ``categories``, or None if empty.
    file_class: str | None
    # All category tags gathered from every signal, sorted alphabetically.
    categories: tuple[str, ...]




[docs]
def classify_file(path: str | Path, *, sample_size: int = 4096) -> FileClassification:
    """Classify a file using extension, magic bytes, MIME, and ``file(1)`` signals.

    Args:
        path: Location of the file to inspect.
        sample_size: Number of leading bytes to read for magic-byte matching;
            must be at least 1.

    Returns:
        A ``FileClassification`` holding every individual signal, the sorted
        union of all category tags they imply, and the class chosen from
        those tags.

    Raises:
        ValueError: If ``sample_size`` is less than 1.
        OSError: If the file cannot be opened for reading.
    """
    target = Path(path)
    suffix = target.suffix.lower()

    # The header is read first, so an unreadable file fails before any
    # external command is launched.
    magic_type, magic_mime_type, magic_tags = _classify_magic(
        _read_header(target, sample_size)
    )
    description = _run_file_command(target, "--brief")
    detected_mime = _run_file_command(target, "--brief", "--mime-type")
    guessed_mime, _encoding = mimetypes.guess_type(target.name)

    # Merge the tags contributed by every signal into one set.
    tags = set().union(
        _EXTENSION_CATEGORIES.get(suffix, set()),
        magic_tags,
        *(
            _categories_for_mime(mime)
            for mime in (guessed_mime, detected_mime, magic_mime_type)
        ),
    )
    ordered_tags = tuple(sorted(tags))

    return FileClassification(
        path=target,
        extension=suffix,
        file_description=description,
        file_mime_type=detected_mime,
        extension_mime_type=guessed_mime,
        magic_type=magic_type,
        magic_mime_type=magic_mime_type,
        file_class=_select_file_class(ordered_tags),
        categories=ordered_tags,
    )



def _read_header(path: Path, sample_size: int) -> bytes:
    """Return up to ``sample_size`` bytes from the start of ``path``.

    Raises:
        ValueError: If ``sample_size`` is less than 1.
        OSError: If the file cannot be opened in binary mode.
    """
    if sample_size < 1:
        raise ValueError("sample_size must be greater than zero")

    with path.open(mode="rb") as stream:
        leading_bytes = stream.read(sample_size)
    return leading_bytes


def _classify_magic(header: bytes) -> tuple[str | None, str | None, set[str]]:
    """Identify a file from its leading bytes using ``_MAGIC_SIGNATURES``.

    Returns:
        ``(type description, MIME type, category tags)`` for the first table
        entry whose signature ``header`` starts with, or
        ``(None, None, set())`` when nothing matches. The tag set is a fresh
        copy, so callers may mutate it without touching the table.
    """
    hit = next(
        (entry for entry in _MAGIC_SIGNATURES if header.startswith(entry[0])),
        None,
    )
    if hit is None:
        return None, None, set()

    _prefix, description, mime, tags = hit
    return description, mime, set(tags)


def _categories_for_mime(mime_type: str | None) -> set[str]:
    """Return the category tags implied by ``mime_type``.

    The result combines any tags registered for the exact MIME string in
    ``_MIME_CATEGORIES`` with the top-level type itself when that is one of
    audio, font, image, text or video. A missing or empty MIME type yields an
    empty set.
    """
    if not mime_type:
        return set()

    top_level, _sep, _subtype = mime_type.partition("/")
    tags = set(_MIME_CATEGORIES.get(mime_type, ()))
    if top_level in ("audio", "font", "image", "text", "video"):
        tags.add(top_level)
    return tags


def _select_file_class(categories: tuple[str, ...]) -> str | None:
    """Choose one headline class from a tuple of category tags.

    The first entry of ``_CLASS_PRIORITY`` found among ``categories`` wins.
    If none is present, the first element of ``categories`` is returned, or
    ``None`` when ``categories`` is empty.
    """
    if not categories:
        return None

    present = frozenset(categories)
    preferred = next(
        (name for name in _CLASS_PRIORITY if name in present),
        None,
    )
    return categories[0] if preferred is None else preferred


def _run_file_command(path: Path, *args: str) -> str | None:
    """Run ``file(1)`` on ``path`` and return its output, best-effort.

    Args:
        path: File to inspect.
        *args: Extra command-line options placed before the path, for
            example ``"--brief"`` or ``"--mime-type"``.

    Returns:
        The command's standard output with surrounding whitespace stripped,
        or ``None`` when the ``file`` executable is not on ``PATH``, cannot
        be started, exits with a non-zero status, or prints nothing.
    """
    if shutil.which("file") is None:
        return None

    try:
        result = subprocess.run(
            # "--" ends option parsing, so a path that begins with "-"
            # (e.g. a relative "-n.txt") is treated as a file name rather
            # than being misread as command-line options.
            ["file", *args, "--", str(path)],
            capture_output=True,
            check=True,
            text=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # Deliberately best-effort: a failing or missing tool yields no signal.
        return None

    output = result.stdout.strip()
    return output or None