|
1 | 1 | from typing import List
|
2 | 2 |
|
3 |
| -from pdfplumber.page import CroppedPage, Page |
| 3 | +from pdfplumber.page import Page |
4 | 4 | from PIL import Image
|
5 | 5 |
|
6 | 6 | from .image import ImageSection, partition_image
|
7 |
| -from .pdf import Section, partition_page |
| 7 | +from .pdf import Section, PageSection, partition_page |
8 | 8 |
|
9 | 9 |
|
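10 | 10 | # NOTE: segment_pdf_page takes a `padding` argument; each segment's bbox is grown by it on every side and clamped to the page bounds.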
|
11 |
| -def segment_pdf_page(page: Page, debug: bool = False) -> List[CroppedPage]: |
| 11 | +def segment_pdf_page(page: Page, debug: bool = False, padding=1) -> List[PageSection]: |
12 | 12 | page_queue = [Section(page_crop=page, vertical_seg=True)]
|
13 | 13 |
|
14 | 14 | parsed_segments = []
|
@@ -36,7 +36,22 @@ def segment_pdf_page(page: Page, debug: bool = False) -> List[CroppedPage]:
|
36 | 36 |
|
37 | 37 | count += 1
|
38 | 38 |
|
39 |
| - return parsed_segments |
| 39 | + ret_parsed_segments = [] |
| 40 | + for crop in parsed_segments: |
| 41 | + bbox = crop.bbox |
| 42 | + if padding: |
| 43 | + bbox = ( |
| 44 | + max(0, bbox[0] - padding), |
| 45 | + max(0, bbox[1] - padding), |
| 46 | + min(bbox[2] + padding, page.width), |
| 47 | + min(bbox[3] + padding, page.height), |
| 48 | + ) |
| 49 | + |
| 50 | + ret_parsed_segments.append( |
| 51 | + PageSection(bounding_box=bbox, page_crop=page.crop(bbox, relative=False)) |
| 52 | + ) |
| 53 | + |
| 54 | + return ret_parsed_segments |
40 | 55 |
|
41 | 56 |
|
42 | 57 | def segment_pdf_image(page_image: Image.Image, padding=1) -> List[ImageSection]:
|
|
0 commit comments