Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 5404797

Browse files
committed Sep 26, 2024
add padding param for pdf documents
1 parent 285d1ac commit 5404797

File tree

2 files changed

+26
-5
lines changed

2 files changed

+26
-5
lines changed
 

‎segmentor/document/pdf.py

+7 -1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass
2-
from typing import List
2+
from typing import List, Tuple
33

44
from pdfplumber.page import CroppedPage
55
import numpy as np
@@ -14,6 +14,12 @@ class Section:
1414
seg_depth: int = 0
1515

1616

17+
@dataclass
18+
class PageSection:
19+
bounding_box: Tuple[int, int, int, int]
20+
page_crop: CroppedPage
21+
22+
1723
def check_object_intersections(page_objs, scan_line, p0, p1):
1824
is_crossed = False
1925
for obj_type in page_objs:

‎segmentor/document/segment.py

+19 -4
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
from typing import List
22

3-
from pdfplumber.page import CroppedPage, Page
3+
from pdfplumber.page import Page
44
from PIL import Image
55

66
from .image import ImageSection, partition_image
7-
from .pdf import Section, partition_page
7+
from .pdf import Section, PageSection, partition_page
88

99

1010
# TODO: add padding argument
11-
def segment_pdf_page(page: Page, debug: bool = False) -> List[CroppedPage]:
11+
def segment_pdf_page(page: Page, debug: bool = False, padding=1) -> List[PageSection]:
1212
page_queue = [Section(page_crop=page, vertical_seg=True)]
1313

1414
parsed_segments = []
@@ -36,7 +36,22 @@ def segment_pdf_page(page: Page, debug: bool = False) -> List[CroppedPage]:
3636

3737
count += 1
3838

39-
return parsed_segments
39+
ret_parsed_segments = []
40+
for crop in parsed_segments:
41+
bbox = crop.bbox
42+
if padding:
43+
bbox = (
44+
max(0, bbox[0] - padding),
45+
max(0, bbox[1] - padding),
46+
min(bbox[2] + padding, page.width),
47+
min(bbox[3] + padding, page.height),
48+
)
49+
50+
ret_parsed_segments.append(
51+
PageSection(bounding_box=bbox, page_crop=page.crop(bbox, relative=False))
52+
)
53+
54+
return ret_parsed_segments
4055

4156

4257
def segment_pdf_image(page_image: Image.Image, padding=1) -> List[ImageSection]:

0 commit comments

Comments
 (0)