diff --git a/.gitignore b/.gitignore
index f8b73e7..8795b30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -138,3 +138,6 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+
+# Other
+*.pdf
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..2c07333
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/pdf_split.py b/pdf_split.py
new file mode 100644
index 0000000..a6e3f12
--- /dev/null
+++ b/pdf_split.py
@@ -0,0 +1,145 @@
+"""Split the first page of a PDF into horizontal bands, optionally resizing it."""
+
+from pypdf import PdfReader, PdfWriter, Transformation
+
+POINTS_PER_INCH = 72  # PDF user-space units per inch
+
+
+def resize_pdf_page(page, scale_factor):
+    """
+    Resize or rescale a single PDF page, including its content and canvas.
+
+    The page is modified in place and also returned for convenience.
+
+    Args:
+        page (PageObject): The page to be resized.
+        scale_factor (float): The factor by which to scale the page.
+            e.g., 0.5 for 50% size, 2.0 for 200% size.
+
+    Returns:
+        PageObject: The resized page (the same object that was passed in).
+    """
+    # Apply scaling transformation to the page content
+    transformation = Transformation().scale(scale_factor, scale_factor)
+    page.add_transformation(transformation)
+
+    # The content is scaled about the origin, so scale both corners of the
+    # media box too; keeping the old lower-left corner would shift the visible
+    # window on pages whose media box does not start at (0, 0).
+    media_box = page.mediabox
+    left, bottom = media_box.left, media_box.bottom
+    right, top = media_box.right, media_box.top
+    page.mediabox.lower_left = (left * scale_factor, bottom * scale_factor)
+    page.mediabox.upper_right = (right * scale_factor, top * scale_factor)
+
+    return page
+
+
+def split_pdf_vertically(
+    input_pdf,
+    output_pdf_prefix,
+    cut_points=None,
+    new_width: float | None = 0,
+    single_output=True,
+):
+    """
+    Split the first page of a PDF into horizontal bands, optionally resized.
+
+    Only the first page of the input is read; any other pages are ignored.
+
+    Args:
+        input_pdf (str): Path to the input PDF file.
+        output_pdf_prefix (str): Prefix for output PDF files.
+        cut_points (list of tuples): List of (start_y, end_y) ratios of the
+            page height, used as y coordinates in PDF user space (whose
+            default origin is the bottom-left corner). Defaults to
+            [(0, 1)], i.e. the whole page. Segments are emitted in reverse
+            list order.
+        new_width (float): Desired width of the output PDF in inches.
+            0 or None (the default) means no resizing.
+        single_output (bool): If True, outputs a single PDF with multiple pages.
+            If False, outputs a separate PDF for each segment.
+    """
+    if cut_points is None:
+        cut_points = [(0, 1)]
+
+    reader = PdfReader(input_pdf)
+    original_page = reader.pages[0]
+
+    # Resize the page if a new width is specified
+    if new_width:
+        scale_factor = (new_width * POINTS_PER_INCH) / original_page.mediabox.width
+        original_page = resize_pdf_page(original_page, scale_factor)
+        width_inches = new_width
+    else:
+        width_inches = original_page.mediabox.width / POINTS_PER_INCH
+    # File names carry the width in whole inches; the "03d" format spec
+    # rejects floats, so truncate explicitly.
+    width_tag = int(width_inches)
+
+    media_box = original_page.mediabox
+    left, right = media_box.left, media_box.right
+    bottom, height = media_box.bottom, media_box.height
+
+    def to_points(ratio):
+        """Convert a height ratio to whole points above the page bottom."""
+        return int(height * ratio)
+
+    # Create a single writer for combined output, if needed
+    combined_writer = PdfWriter() if single_output else None
+
+    for start_ratio, end_ratio in reversed(cut_points):
+        start_y, end_y = to_points(start_ratio), to_points(end_ratio)
+        # Order the two cuts so the lower-left corner really is below the
+        # upper-right one; the media box is then never inverted.
+        low, high = sorted((start_y, end_y))
+
+        # Clone and crop the page
+        writer = PdfWriter()
+        new_page = original_page.clone(writer)
+        new_page.mediabox.lower_left = (left, bottom + low)
+        new_page.mediabox.upper_right = (right, bottom + high)
+        if single_output:
+            combined_writer.add_page(new_page)
+        else:
+            writer.add_page(new_page)
+            with open(
+                f"{output_pdf_prefix}W{width_tag:03d}_{start_y}-{end_y}.pdf", "wb"
+            ) as f:
+                writer.write(f)
+
+    # Write the single output file, if applicable
+    if single_output:
+        # Sort numerically before joining: sorting the strings would place
+        # "1000" ahead of "200".
+        points = sorted({to_points(r) for pair in cut_points for r in pair})
+        ct = "-".join(map(str, points))
+
+        with open(f"{output_pdf_prefix}W{width_tag:03d}_{ct}_combined.pdf", "wb") as f:
+            combined_writer.write(f)
+
+
+def main():
+    """Run the example splits on sample.pdf in the current directory."""
+    input_pdf = "sample.pdf"
+    output_pdf_prefix = "split"
+    # Specify (start_y, end_y) for each segment
+    cut_points = [(0, 0.0949358), (0.0949358, 0.2)]
+    split_pdf_vertically(
+        input_pdf, output_pdf_prefix, cut_points, new_width=None, single_output=False
+    )
+    split_pdf_vertically(
+        input_pdf, "resize", cut_points=None, new_width=32, single_output=True
+    )
+    # Alternative: four equal quarters, each resized to 32 inches wide.
+    # split_pdf_vertically(
+    #     input_pdf,
+    #     output_pdf_prefix,
+    #     cut_points=[(0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1)],
+    #     new_width=32,
+    #     single_output=False,
+    # )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..35a8100
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,10 @@
+[project]
+name = "pdf-utils"
+version = "0.0.1"
+description = "Slim PDF Utilities"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "pypdf>=5.1.0",
+]
+
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..61f42f9
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,34 @@
+version = 1
+requires-python = ">=3.10"
+
+[[package]]
+name = "pdf-utils"
+version = "0.0.1"
+source = { virtual = "." }
+dependencies = [
+    { name = "pypdf" },
+]
+
+[package.metadata]
+requires-dist = [{ name = "pypdf", specifier = ">=5.1.0" }]
+
+[[package]]
+name = "pypdf"
+version = "5.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6b/9a/72d74f05f64895ebf1c7f6646cf7fe6dd124398c5c49240093f92d6f0fdd/pypdf-5.1.0.tar.gz", hash = "sha256:425a129abb1614183fd1aca6982f650b47f8026867c0ce7c4b9f281c443d2740", size = 5011381 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976 },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.12.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 },
+]