Source code for smith_utils.text.newlines

from pathlib import Path
import tempfile
from typing import BinaryIO, Literal

# Classification of the newline convention detected in an input stream.
NewlineType = Literal["LF", "CRLF", "CR", "MIXED", "NONE"]


def normalize_newlines_stream(
    src: BinaryIO,
    dst: BinaryIO,
    chunk_size: int = 1024 * 1024,
) -> NewlineType:
    """
    Normalize newlines in a stream, replacing CR and CRLF with LF.

    The source stream is read in chunks; every CRLF pair and every lone CR is
    rewritten as a single LF and the result is written to the destination
    stream. A CRLF pair that straddles two chunks is still recognized as one
    CRLF: a CR at the very end of a chunk is held back until the next chunk
    (or end of input) shows whether an LF follows it.

    Parameters:
        src (BinaryIO): The source stream to read binary data from.
        dst (BinaryIO): The destination stream to write the normalized binary
            data to.
        chunk_size (int): The size passed to ``src.read`` for each chunk. The
            default is 1024 * 1024 bytes (1 MB). Must not be 0.

    Returns:
        NewlineType: "LF", "CRLF" or "CR" when exactly one newline style
        occurs in the source, "MIXED" when more than one style occurs, and
        "NONE" when the source contains no newline at all.

    Raises:
        ValueError: If ``chunk_size`` is 0. A zero-sized read always returns
            empty bytes, which would be mistaken for end of input and the
            whole source would be silently dropped.
    """
    if chunk_size == 0:
        raise ValueError("chunk_size must be non-zero")

    pending_cr = False
    lf_count = 0
    crlf_count = 0
    cr_count = 0

    while True:
        chunk = src.read(chunk_size)
        if not chunk:
            break

        if pending_cr:
            # Re-attach the CR held back from the previous chunk so that the
            # counting and replacing below see it next to whatever follows.
            chunk = b"\r" + chunk
            pending_cr = False

        if chunk.endswith(b"\r"):
            # Cannot classify a trailing CR yet: the next chunk may start
            # with LF, making it a CRLF pair. Hold it back.
            pending_cr = True
            chunk = chunk[:-1]

        if not chunk:
            continue

        # Every "\r\n" occurrence contributes one CR and one LF to the raw
        # totals, so subtracting the pair count leaves the lone CRs and LFs.
        pairs = chunk.count(b"\r\n")
        crlf_count += pairs
        cr_count += chunk.count(b"\r") - pairs
        lf_count += chunk.count(b"\n") - pairs

        # Collapse pairs first so the remaining CRs are all lone ones.
        dst.write(chunk.replace(b"\r\n", b"\n").replace(b"\r", b"\n"))

    if pending_cr:
        # Input ended on a CR with nothing after it: it is a lone CR.
        dst.write(b"\n")
        cr_count += 1

    kinds = sum(bool(x) for x in (lf_count, crlf_count, cr_count))
    if kinds == 0:
        return "NONE"
    if kinds > 1:
        return "MIXED"
    if crlf_count:
        return "CRLF"
    if lf_count:
        return "LF"
    return "CR"
def normalize_file_to_lf(
    input_path: Path,
    output_path: Path,
    *,
    chunk_size: int = 1024 * 1024,
) -> dict:
    """
    Normalize line endings to LF in a streaming fashion.

    Reads ``input_path`` in chunks, rewrites CRLF and CR as LF and writes the
    result to ``output_path``. When both paths refer to the same file
    (including through a symlink or a hard link) the result is written to a
    temporary file in the same directory and then moved over the destination,
    so the source is never truncated while it is still being read.

    Parameters:
        input_path (Path): File to read.
        output_path (Path): File to write. Missing parent directories are
            created.
        chunk_size (int): Read size handed to the stream normalizer. Must not
            be 0.

    Returns:
        dict with:
            - newline_type: str (LF, CRLF, CR, MIXED, NONE)
            - bytes_in: int
            - bytes_out: int

    Raises:
        ValueError: If ``chunk_size`` is 0 (a zero-sized read looks like end
            of input and would produce an empty output file).
        FileNotFoundError: If ``input_path`` does not exist.
    """
    # Reject before touching the filesystem: with a zero read size the output
    # (possibly the input itself) would be replaced by an empty file.
    if chunk_size == 0:
        raise ValueError("chunk_size must be non-zero")

    input_path = Path(input_path)
    output_path = Path(output_path)

    src_stat = input_path.stat()
    bytes_in = src_stat.st_size

    # Ensure output directory exists when writing to a different location.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # samefile() compares the underlying file identity, so it also catches
    # hard links, which a comparison of resolved path strings would miss.
    try:
        same_file = input_path.samefile(output_path)
    except FileNotFoundError:
        # Output doesn't exist yet, so it cannot be the input.
        same_file = False

    if same_file:
        # Avoid clobbering the source while reading. Write to a temp file in
        # the same directory and atomically replace the original.
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(
                mode="wb",
                dir=str(output_path.parent),
                prefix=f".{output_path.name}.",
                suffix=".tmp",
                delete=False,
            ) as tmp:
                tmp_path = Path(tmp.name)
                with input_path.open("rb") as src:
                    newline_type = normalize_newlines_stream(
                        src, tmp, chunk_size=chunk_size
                    )
                bytes_out = tmp.tell()

            # Temp files are created owner-only; carry over the original
            # permission bits so an in-place rewrite does not change them.
            # Best effort: a filesystem that refuses chmod should not make
            # the normalization itself fail.
            try:
                tmp_path.chmod(src_stat.st_mode & 0o7777)
            except OSError:
                pass

            tmp_path.replace(output_path)
        except BaseException:
            # delete=False means nobody else removes the temp file; clean it
            # up on any failure (including interruption) and re-raise.
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)
            raise
    else:
        with input_path.open("rb") as src, output_path.open("wb") as dst:
            newline_type = normalize_newlines_stream(src, dst, chunk_size=chunk_size)
            bytes_out = dst.tell()

    return {"newline_type": newline_type, "bytes_in": bytes_in, "bytes_out": bytes_out}