Skip to content

Commit 8b750df

Browse files
committed
init implementation of x-y segmentation
0 parents  commit 8b750df

File tree

3 files changed

+333
-0
lines changed

3 files changed

+333
-0
lines changed

.gitignore

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# poetry
98+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99+
# This is especially recommended for binary packages to ensure reproducibility, and is more
100+
# commonly ignored for libraries.
101+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102+
#poetry.lock
103+
104+
# pdm
105+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106+
#pdm.lock
107+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108+
# in version control.
109+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110+
.pdm.toml
111+
.pdm-python
112+
.pdm-build/
113+
114+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115+
__pypackages__/
116+
117+
# Celery stuff
118+
celerybeat-schedule
119+
celerybeat.pid
120+
121+
# SageMath parsed files
122+
*.sage.py
123+
124+
# Environments
125+
.env
126+
.venv
127+
env/
128+
venv/
129+
ENV/
130+
env.bak/
131+
venv.bak/
132+
133+
# Spyder project settings
134+
.spyderproject
135+
.spyproject
136+
137+
# Rope project settings
138+
.ropeproject
139+
140+
# mkdocs documentation
141+
/site
142+
143+
# mypy
144+
.mypy_cache/
145+
.dmypy.json
146+
dmypy.json
147+
148+
# Pyre type checker
149+
.pyre/
150+
151+
# pytype static type analyzer
152+
.pytype/
153+
154+
# Cython debug symbols
155+
cython_debug/
156+
157+
# PyCharm
158+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160+
# and can be added to the global gitignore or merged into this file. For a more nuclear
161+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
162+
#.idea/

segmentor/document/scan.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from typing import List
2+
3+
import numpy as np
4+
from regex import R
5+
6+
7+
def page_scan(page_objs, page_dim, line_spacing=5.0, vertical_scan=True, debug=False):
    """Sweep evenly spaced scan lines across a page and find the occupied spans.

    A scan line counts as "crossed" when its position lies strictly between
    the two edges of at least one ``line``, ``curve``, ``rect``, ``char`` or
    ``image`` object. Objects of any other type are ignored.

    Args:
        page_objs: Mapping from object type name to a list of objects. Each
            object is indexed by edge name (``"top"``/``"bottom"`` or
            ``"x0"``/``"x1"``).
        page_dim: Bounds unpacked as the leading arguments of ``np.arange``
            (e.g. ``(start, stop)``) to generate the scan positions.
        line_spacing: Step between consecutive scan positions.
        vertical_scan: When true, scan lines are dropped from top to bottom
            and compared against ``"top"``/``"bottom"``; otherwise they are
            compared against ``"x0"``/``"x1"``.
        debug: When true, also return the per-line results for inspection.

    Returns:
        A 2-tuple ``(sections, debug_info)``. ``sections`` is whatever
        ``div_intersections`` returns for the crossed flags and scan
        positions. ``debug_info`` is a ``zip`` of ``(crossed, position)``
        pairs when ``debug`` is true, else ``None``. The ``zip`` is a one-shot
        iterator.
    """
    scanned_types = {"line", "curve", "rect", "char", "image"}
    if vertical_scan:
        lo_key, hi_key = "top", "bottom"
    else:
        lo_key, hi_key = "x0", "x1"

    positions = list(np.arange(*page_dim, line_spacing))

    # One flag per scan position; any() stops at the first object that
    # straddles the line.
    crossed_flags = [
        any(
            obj[lo_key] < position < obj[hi_key]
            for kind in page_objs
            if kind in scanned_types
            for obj in page_objs[kind]
        )
        for position in positions
    ]

    debug_pairs = zip(crossed_flags, positions) if debug else None
    return div_intersections(crossed_flags, positions), debug_pairs
37+
38+
39+
def div_intersections(intersections: List[bool], scan_lines: List[float]):
    """Turn per-scan-line hit flags into ``(start, end)`` crop bounds.

    Every maximal run of consecutive truthy entries in ``intersections``
    becomes one section. Each section is widened by one scan line on either
    side, clamped to the first and last entries of ``scan_lines``, so the
    bounds land on scan lines just outside the run where possible.

    Args:
        intersections: One flag per scan line, truthy where the line was
            crossed.
        scan_lines: Scan line positions, index-aligned with ``intersections``.

    Returns:
        A list of ``(p0, p1)`` tuples taken from ``scan_lines``, one per run,
        in the order the runs occur.
    """
    last_index = len(scan_lines) - 1
    crop_dims = []
    run_start = None

    # The trailing False sentinel closes a run that reaches the end of the
    # input, so no separate post-loop check is needed.
    for idx, hit in enumerate([*intersections, False]):
        if hit:
            if run_start is None:
                run_start = idx
            continue
        if run_start is None:
            continue

        run_end = idx - 1
        lower = scan_lines[max(run_start - 1, 0)]
        upper = scan_lines[min(run_end + 1, last_index)]
        crop_dims.append((lower, upper))
        run_start = None

    return crop_dims

segmentor/document/segment.py

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
from dataclasses import dataclass
2+
from typing import List, Tuple
3+
4+
from pdfplumber.page import CroppedPage, Page
5+
from PIL import Image
6+
7+
from .scan import page_scan
8+
9+
10+
@dataclass
class Section:
    """A page region together with the direction it should be split in next."""

    # The region itself. Annotated as ``Page`` (the base class of
    # ``CroppedPage``) because the root section is built from an uncropped
    # page, while sections produced by a split hold a ``CroppedPage``.
    page_crop: Page
    # True: scan the region along top/bottom; False: scan along x0/x1.
    vertical_seg: bool
    # Number of splits performed to reach this region (0 for the root).
    seg_depth: int = 0
15+
16+
17+
class ImageSection:
    """Placeholder for an image-backed section.

    Only the ``bounding_box`` annotation is declared. The class defines no
    ``__init__`` and is not a dataclass, so instances carry no
    ``bounding_box`` attribute until one is assigned.
    """

    # Four integers describing the region; the coordinate order is not fixed
    # anywhere in this module -- TODO confirm when image support is added.
    bounding_box: Tuple[int, int, int, int]
19+
20+
21+
def section_page(
    page: Section, page_breaks, vertical_div=True, debug_info=None
) -> List[CroppedPage]:
    """Cut a section into one crop per break interval.

    Args:
        page: Section whose ``page_crop`` is cropped.
        page_breaks: Reversible sequence of ``(p0, p1)`` pairs along the
            split axis.
        vertical_div: When true, each pair supplies the top/bottom bounds and
            the section's x-extent is kept. Otherwise each pair supplies the
            x0/x1 bounds and the section's y-extent is kept.
        debug_info: Optional iterable of ``(crossed, position)`` pairs. When
            truthy, every crossed position is drawn on an image of the
            section and the image is shown.

    Returns:
        The crops, in the reverse of the order given by ``page_breaks``.
    """
    if debug_info:
        preview = page.page_crop.to_image()
        # Horizontal rules mark a top-to-bottom scan, vertical rules a
        # left-to-right one.
        draw_scan_line = preview.draw_hline if vertical_div else preview.draw_vline
        for crossed, position in debug_info:
            if crossed:
                draw_scan_line(position)
        preview.show()

    x0, top, x1, bottom = page.page_crop.bbox

    def bounds_for(span):
        """Build the crop box for one ``(p0, p1)`` interval."""
        lo, hi = span
        if vertical_div:
            return (x0, lo, x1, hi)
        return (lo, top, hi, bottom)

    return [
        page.page_crop.crop(bounds_for(span), relative=False)
        for span in reversed(page_breaks)
    ]
49+
50+
51+
# TODO: Support image operations for page scan
52+
def partition_page(page: Section, debug=False) -> List[CroppedPage]:
    """Scan a section along its configured axis and crop it at the gaps found.

    Args:
        page: Section to split; ``page.vertical_seg`` selects the scan axis.
        debug: Forwarded to ``page_scan``; the debug data it returns is
            handed to ``section_page``.

    Returns:
        The crops produced by ``section_page`` for the detected breaks.
    """
    x0, top, x1, bottom = page.page_crop.bbox

    # Scan extent and spacing per axis; the spacings are arbitrary
    # hyperparameters.
    if page.vertical_seg:
        extent, spacing = (top, bottom), 5.0
    else:
        extent, spacing = (x0, x1), 8.0

    breaks, debug_pairs = page_scan(
        page.page_crop.objects,
        extent,
        vertical_scan=page.vertical_seg,
        line_spacing=spacing,
        debug=debug,
    )
    return section_page(
        page, breaks, vertical_div=page.vertical_seg, debug_info=debug_pairs
    )
71+
72+
73+
def partition_image(page: Image.Image, vertical_scan=True):
    """Partition a page supplied as an image (not implemented yet).

    Args:
        page: The page as a PIL image. ``Image`` is the ``PIL.Image`` module
            in this file, so the image class is spelled ``Image.Image``.
        vertical_scan: Intended scan direction; currently unused.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
75+
76+
77+
def segment_pdf_page(page: Page, debug=False) -> List[CroppedPage]:
    """Split a PDF page into segments by alternating x-y cuts.

    The page is first partitioned with a top-to-bottom scan. Every resulting
    crop is then partitioned along the other axis, and so on, alternating the
    direction at each level. Sections are processed first-in, first-out.

    Args:
        page: The page to segment.
        debug: Forwarded to ``partition_page`` for every section.

    Returns:
        The crops that could not be partitioned further, in the order they
        were finalized.
    """
    from collections import deque

    # deque gives O(1) pops from the front of the work queue.
    pending = deque([Section(page_crop=page, vertical_seg=True)])
    parsed_segments = []

    while pending:
        current = pending.popleft()

        # Partition along the direction stored on the section, which is the
        # opposite of the direction that produced it.
        crops = partition_page(current, debug=debug)

        # If the section cannot be partitioned further, then it is final.
        # NOTE(review): a section yielding exactly one crop is accepted
        # without being scanned along the other axis, and a section yielding
        # no crops is dropped -- confirm both are intended.
        if len(crops) == 1:
            parsed_segments.append(crops[0])
            continue

        next_depth = current.seg_depth + 1
        pending.extend(
            Section(
                page_crop=crop,
                vertical_seg=not current.vertical_seg,
                seg_depth=next_depth,
            )
            for crop in crops
        )

    return parsed_segments

0 commit comments

Comments
 (0)