init implementation of x-y segmentation

johnathanchiu · johnathanchiu · commit 8b750dfa7e62 · 2024-09-07T13:24:52.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/segmentor/document/scan.py b/segmentor/document/scan.py
@@ -0,0 +1,66 @@
+from typing import List
+
+import numpy as np
+from regex import R
+
+
+def page_scan(page_objs, page_dim, line_spacing=5.0, vertical_scan=True, debug=False):
+    """Sweep evenly spaced scan lines over a region and find the occupied spans.
+
+    Args:
+        page_objs: Mapping of object-type name to a list of object dicts. Only
+            the "line", "curve", "rect", "char" and "image" entries are read.
+            Each object must provide "top"/"bottom" (vertical scan) or
+            "x0"/"x1" (horizontal scan).
+        page_dim: ``(start, stop)`` extent to sweep along the scan axis.
+        line_spacing: Distance between consecutive scan lines; must be > 0.
+        vertical_scan: When True the scan lines run across the page and are
+            dropped from top to bottom; otherwise the "x0"/"x1" axis is swept.
+        debug: When True, also return the per-line crossing flags.
+
+    Returns:
+        A tuple ``(sections, debug_info)``. ``sections`` is the list of
+        ``(p0, p1)`` spans computed by ``div_intersections``. ``debug_info`` is
+        a single-pass ``zip`` of ``(crossed, scan_line)`` pairs when ``debug``
+        is True, otherwise None.
+
+    Raises:
+        ValueError: If ``line_spacing`` is not positive.
+    """
+    if line_spacing <= 0:
+        raise ValueError(f"line_spacing must be positive, got {line_spacing!r}")
+
+    # vertical scan implies the lines are going across the page dropped from top to bottom
+    p0, p1 = ("top", "bottom") if vertical_scan else ("x0", "x1")
+
+    start, stop = page_dim
+    scan_lines = list(np.arange(start, stop, line_spacing))
+    # np.arange never yields `stop`. Without a closing line at the far edge, a
+    # run that reaches the last scan line would be cut off there and anything
+    # between that line and the region boundary would be clipped away.
+    if scan_lines and scan_lines[-1] < stop:
+        scan_lines.append(stop)
+
+    # We only check objects that fall into these categories. The extents are
+    # collected once so the per-line test below is a plain comparison.
+    spans = [
+        (obj[p0], obj[p1])
+        for obj_type in ("line", "curve", "rect", "char", "image")
+        if obj_type in page_objs
+        for obj in page_objs[obj_type]
+    ]
+
+    # A scan line is crossed when it lies strictly inside any object's extent.
+    scan_intersects = [
+        any(low < scan_line < high for low, high in spans)
+        for scan_line in scan_lines
+    ]
+
+    debug_info = None
+    if debug:
+        debug_info = zip(scan_intersects, scan_lines)
+
+    return div_intersections(scan_intersects, scan_lines), debug_info
+
+
+def div_intersections(intersections: List[bool], scan_lines: List[float]):
+    """Turn per-line crossing flags into padded ``(start, end)`` coordinates.
+
+    Consecutive truthy entries of `intersections` form one section. Each
+    section is widened by one scan line on both sides (clamped to the first
+    and last entries of `scan_lines`) and reported as the pair of `scan_lines`
+    values at those widened positions.
+    """
+    last_line = len(scan_lines) - 1
+    crop_dims = []
+    run_start = None
+
+    # The trailing False sentinel closes a run that reaches the end of the list.
+    for idx, hit in enumerate([*intersections, False]):
+        if hit and run_start is None:
+            run_start = idx
+        elif not hit and run_start is not None:
+            run_end = idx - 1
+            lower = scan_lines[max(run_start - 1, 0)]
+            upper = scan_lines[min(run_end + 1, last_line)]
+            crop_dims.append((lower, upper))
+            run_start = None
+
+    return crop_dims
diff --git a/segmentor/document/segment.py b/segmentor/document/segment.py
@@ -0,0 +1,105 @@
+from collections import deque
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from pdfplumber.page import CroppedPage, Page
+from PIL import Image
+
+from .scan import page_scan
+
+
+@dataclass
+class Section:
+    """A page region queued for partitioning along one scan direction."""
+
+    # Region to partition; the root section is built from a full `Page`.
+    page_crop: CroppedPage
+    # True to scan along "top"/"bottom" coordinates, False for "x0"/"x1".
+    vertical_seg: bool
+    # Number of partition passes that produced this section (0 for the root).
+    seg_depth: int = 0
+
+
+@dataclass
+class ImageSection:
+    """Bounding box of a section found in an image.
+
+    Declared as a dataclass, like `Section`, so the annotated field becomes a
+    constructor argument; a bare annotation alone creates no attribute.
+    """
+
+    # Four integer coordinates of the box.
+    # NOTE(review): coordinate order is not established anywhere yet - confirm.
+    bounding_box: Tuple[int, int, int, int]
+
+
+def section_page(
+    page: Section, page_breaks, vertical_div=True, debug_info=None
+) -> List[CroppedPage]:
+    """Crop `page` once per entry of `page_breaks`.
+
+    Args:
+        page: Section whose `page_crop` is cropped.
+        page_breaks: Sequence of ``(p0, p1)`` spans along the division axis.
+        vertical_div: When True the spans are top/bottom bounds and the crop
+            keeps the full width; otherwise they are left/right bounds and
+            the crop keeps the full height.
+        debug_info: Optional iterable of ``(crossed, location)`` pairs. When
+            truthy, every crossed location is drawn on an image of the crop
+            and the image is shown.
+
+    Returns:
+        The crops, in the reverse order of `page_breaks`.
+    """
+    source = page.page_crop
+
+    if debug_info:
+        preview = source.to_image()
+        # Dividing vertically means the scan lines are horizontal.
+        draw_line = preview.draw_hline if vertical_div else preview.draw_vline
+        for crossed, location in debug_info:
+            if crossed:
+                draw_line(location)
+        preview.show()
+
+    left, top, right, bottom = source.bbox
+
+    def to_bbox(span):
+        lower, upper = span
+        if vertical_div:
+            return (left, lower, right, upper)
+        return (lower, top, upper, bottom)
+
+    return [
+        source.crop(to_bbox(span), relative=False)
+        for span in reversed(page_breaks)
+    ]
+
+
+# TODO: Support image operations for page scan
+def partition_page(
+    page: Section,
+    debug=False,
+    vertical_line_spacing: float = 5.0,
+    horizontal_line_spacing: float = 8.0,
+) -> List[CroppedPage]:
+    """Scan a section along its configured direction and crop the occupied spans.
+
+    Args:
+        page: Section to partition; `page.vertical_seg` selects the scan axis.
+        debug: Forwarded to `page_scan`; the resulting debug info is passed to
+            `section_page`, which draws and shows the crossed scan lines.
+        vertical_line_spacing: Scan-line spacing used when `page.vertical_seg`
+            is True.
+        horizontal_line_spacing: Scan-line spacing used when
+            `page.vertical_seg` is False.
+
+    Returns:
+        The crops produced by `section_page` for the spans found by the scan.
+    """
+    page_bbox = page.page_crop.bbox
+    # The defaults are arbitrary hyperparameters; callers may tune them.
+    if page.vertical_seg:
+        page_dim = (page_bbox[1], page_bbox[3])
+        line_spacing = vertical_line_spacing
+    else:
+        page_dim = (page_bbox[0], page_bbox[2])
+        line_spacing = horizontal_line_spacing
+
+    page_breaks, debug_info = page_scan(
+        page.page_crop.objects,
+        page_dim,
+        vertical_scan=page.vertical_seg,
+        line_spacing=line_spacing,
+        debug=debug,
+    )
+    return section_page(
+        page, page_breaks, vertical_div=page.vertical_seg, debug_info=debug_info
+    )
+
+
+def partition_image(page: Image.Image, vertical_scan=True):
+    """Placeholder for image-based partitioning; not implemented yet.
+
+    Args:
+        page: Image to partition. `Image` is the imported PIL module, so the
+            image class is spelled `Image.Image`.
+        vertical_scan: Scan direction flag; currently unused.
+
+    Raises:
+        NotImplementedError: Always.
+    """
+    raise NotImplementedError()
+
+
+def segment_pdf_page(page: Page, debug=False) -> List[CroppedPage]:
+    """Split a PDF page into segments by alternating scan directions.
+
+    The page is first partitioned with a vertical scan. Each resulting crop is
+    queued and partitioned again with the opposite scan direction. A section
+    whose partition yields exactly one crop is final and that crop is added to
+    the result; a partition that yields no crops contributes nothing.
+
+    Args:
+        page: Page to segment.
+        debug: Forwarded to `partition_page` for every section.
+
+    Returns:
+        The final crops, in the order the queue produced them.
+    """
+    # deque gives O(1) pops from the front; list.pop(0) shifts every element.
+    page_queue = deque([Section(page_crop=page, vertical_seg=True)])
+    parsed_segments = []
+
+    while page_queue:
+        # Get the next crop in the queue
+        curr_crop = page_queue.popleft()
+
+        # Partition the crop along its own scan direction
+        crops = partition_page(curr_crop, debug=debug)
+
+        # If the page cannot be partitioned further, then insert it directly
+        # into `parsed_segments` instead of queueing it again.
+        if len(crops) == 1:
+            parsed_segments.extend(crops)
+            continue
+
+        # Children are partitioned by the opposite method, one level deeper.
+        page_queue.extend(
+            Section(
+                page_crop=crop,
+                vertical_seg=not curr_crop.vertical_seg,
+                seg_depth=curr_crop.seg_depth + 1,
+            )
+            for crop in crops
+        )
+
+    return parsed_segments