-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscrollshot2pdf.py
More file actions
726 lines (606 loc) · 24.8 KB
/
scrollshot2pdf.py
File metadata and controls
726 lines (606 loc) · 24.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
import argparse
import os
import sys
from PIL import Image
from reportlab.lib import pagesizes
from reportlab.pdfgen import canvas
# Try to import pytesseract but make it optional
try:
import pytesseract
TESSERACT_AVAILABLE = True
except ImportError:
TESSERACT_AVAILABLE = False
# Get all paper sizes from reportlab
PAGE_SIZES = {
name.lower(): getattr(pagesizes, name)
for name in dir(pagesizes)
if name.isupper() and isinstance(getattr(pagesizes, name), tuple)
}
# Add landscape variants
PAGE_SIZES.update({f"{name}-landscape": (height, width) for name, (width, height) in PAGE_SIZES.items()})
def mm_to_pixels(mm: float, dpi: int = 72) -> int:
"""Convert millimeters to pixels at given DPI."""
return int(mm * dpi / 25.4)
def mm_to_points(mm: float) -> float:
"""Convert millimeters to points (1/72 inch)."""
return mm * 72 / 25.4 # 1 inch = 25.4mm, 1 inch = 72 points
def parse_margin(margin_str: str) -> float:
"""Parse margin string (either pixels or mm) to points."""
if margin_str.endswith("mm"):
return mm_to_points(float(margin_str[:-2]))
elif margin_str.endswith("px"):
return float(margin_str[:-2]) * 72 / 96 # Assuming 96 DPI for pixels
else:
try:
return float(margin_str) * 72 / 96 # Treat bare numbers as pixels
except ValueError as err:
raise ValueError("Margin must be specified in px, mm, or as plain pixels") from err
def trim_whitespace(image: Image.Image) -> Image.Image:
"""Trim whitespace from image edges."""
# Convert to RGB if image is in RGBA mode
if image.mode == "RGBA":
background = Image.new("RGBA", image.size, (255, 255, 255, 255))
background.paste(image, mask=image.split()[3]) # Use alpha channel as mask
image = background.convert("RGB")
elif image.mode != "RGB":
image = image.convert("RGB")
# Get the bounding box of non-white pixels
bbox = image.getbbox()
if bbox:
return image.crop(bbox)
return image
def find_content_gaps(image: Image.Image, min_gap_size: int = 50, blank_ratio: float = 0.0) -> list[int]:
"""
Find vertical positions where there are gaps in content.
Returns a list of y-coordinates where gaps occur.
Args:
image: Image to analyze
min_gap_size: Minimum gap size in pixels to consider for page breaks
blank_ratio: Ratio of non-blank to blank pixels allowed in blank lines
(0.0=strict, 0.1=10% non-blank allowed)
"""
# Convert to grayscale for analysis
gray = image.convert("L")
width, height = gray.size
# Get image data
pixels = list(gray.getdata())
# Check each row for content
gaps = []
current_gap_start = None
for y in range(height):
row_start = y * width
row_end = row_start + width
row = pixels[row_start:row_end]
# Check if row is empty based on blank_ratio
if blank_ratio == 0.0:
# Strict mode: all pixels must be > 250 (nearly white)
is_empty = all(p > 250 for p in row)
else:
# Ratio mode: allow some non-blank pixels
non_blank_pixels = sum(1 for p in row if p <= 250)
non_blank_ratio = non_blank_pixels / len(row)
is_empty = non_blank_ratio <= blank_ratio
if is_empty and current_gap_start is None:
current_gap_start = y
elif not is_empty and current_gap_start is not None:
gap_size = y - current_gap_start
if gap_size >= min_gap_size:
gaps.append(current_gap_start + gap_size // 2) # Add middle of gap
current_gap_start = None
return gaps
def calculate_slices(
image_height: int, page_height: int, content_gaps: list[int], no_split_content: bool = False
) -> list[tuple[int, int]]:
"""
Calculate optimal slice positions based on page height and content gaps.
Returns list of (start_y, end_y) tuples.
"""
slices = []
current_pos = 0
while current_pos < image_height:
# Calculate ideal next slice position
ideal_next_pos = min(current_pos + page_height, image_height)
# If we're at the end, add the final slice
if ideal_next_pos >= image_height:
slices.append((current_pos, image_height))
break
# Find the last content gap before the ideal split point.
best_gap = None
for gap in content_gaps:
if current_pos < gap < ideal_next_pos:
best_gap = gap
next_pos = ideal_next_pos
if best_gap is not None:
# If a gap was found within the page, use it to avoid splitting content.
next_pos = best_gap
elif no_split_content:
# If in strict no-split mode and no gap was found, find the very next
# available gap in the image, even if it makes a very long page.
next_available_gap = None
for gap in content_gaps:
if gap > current_pos:
next_available_gap = gap
break
if next_available_gap is not None:
next_pos = next_available_gap
else:
# No more gaps in the entire image, so this slice must go to the end.
next_pos = image_height
slices.append((current_pos, next_pos))
current_pos = next_pos
return slices
def title_from_filename(filename: str) -> str:
"""Convert filename to title, without extension and titlecased if all lowercase."""
# Remove extension and directory path
base = os.path.splitext(os.path.basename(filename))[0]
# Only titlecase if the string is all lowercase
if base.islower():
return base.replace("_", " ").replace("-", " ").title()
return base.replace("_", " ").replace("-", " ")
def parse_page_range(range_str: str, total_pages: int) -> tuple[int, int]:
"""Parse page range string into start and end page numbers (1-based)."""
if not range_str:
return 1, total_pages
try:
if "-" in range_str:
start_str, end_str = range_str.split("-", 1)
start = int(start_str) if start_str else 1
end = int(end_str) if end_str else total_pages
else:
start = end = int(range_str)
# Validate range
if start < 1 or end > total_pages or start > end:
raise ValueError
return start, end
except ValueError as err:
raise ValueError(f"Invalid page range. Format: N, N-M, N-, -M (1 to {total_pages})") from err
def calculate_optimal_columns(image_width: float, usable_width: float, image: Image.Image, debug: bool = False) -> int:
"""
Calculate optimal number of columns to minimize scaling while fitting page width.
Accounts for DPI differences between image and PDF output.
"""
# Get image DPI from metadata, default to 72 if not specified
try:
image_dpi = image.info.get("dpi", (72, 72))[0] # Get horizontal DPI
except (AttributeError, TypeError):
image_dpi = 72
output_dpi = 300 # PDF target DPI
# Convert image width to points at output DPI
image_width_at_output_dpi = (image_width * 72) / image_dpi # Convert to points
if debug:
print("\nColumn calculation:")
print(f"Image width: {image_width:.1f}px")
print(f"Image DPI: {image_dpi}")
print(f"Image width at {output_dpi} DPI: {image_width_at_output_dpi:.1f}pt")
print(f"Usable page width: {usable_width:.1f}pt")
# Start with 1 column and increase until we find a solution
for columns in range(1, 11): # Limit to 10 columns max
# Calculate column width with gaps
total_gap_width = 20.0 * (columns - 1) # Using default column gap
column_width = (usable_width - total_gap_width) / columns
# Calculate scale factor based on output DPI
scale_factor = column_width / image_width_at_output_dpi
inverse_scale = 1 / scale_factor
if debug:
print(f"\nTrying {columns} column{'s' if columns > 1 else ''}:")
print(f" Column width: {column_width:.1f}pt")
print(f" Scale factor: {scale_factor:.3f} (1/{inverse_scale:.1f})")
# If scale factor is >= 1, we can use 1 column
if scale_factor >= 1:
if debug:
print(" ✓ Image fits at original size")
return 1
# If inverse of scale factor is close to an integer, we found our solution
if abs(round(inverse_scale) - inverse_scale) < 0.01:
if debug:
print(f" ✓ Clean scaling factor found: 1/{round(inverse_scale)}")
return columns
elif debug:
print(" ✗ No clean scaling factor")
# If no good solution found, default to minimum scaling
if debug:
print("\nNo optimal solution found, defaulting to 1 column")
return 1
def add_ocr_layer(
image: Image.Image,
canvas_obj: canvas.Canvas,
x: float,
y: float,
width: float,
height: float,
lang: str = "eng+jpn",
) -> None:
"""Add searchable text layer using OCR while preserving original image."""
# Configure OCR parameters for better mixed-language detection
custom_config = r"--oem 3 --psm 3" # Use LSTM OCR Engine Mode and Auto-page segmentation
# Get OCR data with bounding boxes
data = pytesseract.image_to_data(image, lang=lang, config=custom_config, output_type=pytesseract.Output.DICT)
# Save canvas state
canvas_obj.saveState()
# Set text color to transparent
canvas_obj.setFillColorRGB(0, 0, 0, 0)
# Calculate scale factors
scale_x = width / image.size[0]
scale_y = height / image.size[1]
# Group text by block_num to maintain text flow
blocks = {}
for i in range(len(data["text"])):
if data["conf"][i] > 0: # Only process text with confidence
block_num = data["block_num"][i]
if block_num not in blocks:
blocks[block_num] = []
blocks[block_num].append(i)
# Process each block
for block_indices in blocks.values():
for i in block_indices:
text = data["text"][i]
if not text.strip():
continue
# Get text dimensions and position
left = data["left"][i]
top = data["top"][i]
data["width"][i]
height_text = data["height"][i]
# Convert coordinates to PDF space
box_x = x + (left * scale_x)
# Adjust y position to align with text baseline
baseline_offset = height_text * 0.2
box_y = y + height - ((top + height_text - baseline_offset) * scale_y)
# Set font size based on the OCR'd text height
font_size = height_text * scale_y
canvas_obj.setFont("Helvetica", font_size)
# Add invisible text
canvas_obj.drawString(box_x, box_y, text)
# Restore state
canvas_obj.restoreState()
# Note: We don't draw the image here anymore, as it's handled by the calling function
def create_pdf(
image: Image.Image,
output_path: str,
page_size: tuple[float, float],
margin_points: float,
min_gap_size: int = 50,
*,
blank_ratio: float = 0.0,
columns: int | None = None,
column_gap: float = 20.0,
add_page_numbers: bool = True,
number_position: str = "bottom-left",
number_font: str = "Helvetica",
number_size: int = 10,
skip_first_number: bool = True,
title: str | None = None,
title_position: str = "center",
title_font: str = "Helvetica-Bold",
title_size: int = 14,
page_range: str | None = None,
enable_ocr: bool = False,
ocr_lang: str = "eng",
debug: bool = False,
no_split_content: bool = False,
) -> None:
"""Create PDF from image with optional OCR layer."""
if enable_ocr and not TESSERACT_AVAILABLE:
print(
"Error: OCR was requested but pytesseract is not installed.",
file=sys.stderr,
)
print("To use OCR, install scrollshot2pdf with OCR support:", file=sys.stderr)
print(
' pip install "git+https://github.com/osteele/scrollshot2pdf.git[ocr]"',
file=sys.stderr,
)
sys.exit(1)
# Check if tesseract executable is available when OCR is enabled
if enable_ocr and not TESSERACT_AVAILABLE:
print(
"Error: OCR was requested but pytesseract is not installed.",
file=sys.stderr,
)
print("To use OCR, install scrollshot2pdf with OCR support:", file=sys.stderr)
print(
' pip install "git+https://github.com/osteele/scrollshot2pdf.git[ocr]"',
file=sys.stderr,
)
sys.exit(1)
# Check if tesseract executable is available when OCR is enabled
if enable_ocr:
try:
pytesseract.get_tesseract_version()
except pytesseract.TesseractNotFoundError:
print(
"Error: Tesseract is not installed. Please install tesseract-ocr first:",
file=sys.stderr,
)
print("Ubuntu/Debian: sudo apt-get install tesseract-ocr", file=sys.stderr)
print("macOS: brew install tesseract", file=sys.stderr)
sys.exit(1)
print("Analyzing image dimensions and calculating layout...")
page_width, page_height = page_size
usable_width = page_width - 2 * margin_points
usable_height = page_height - 2 * margin_points
# Calculate optimal columns if not specified
if columns is None:
columns = calculate_optimal_columns(image.size[0], usable_width, image, debug)
print(f"Automatically selected {columns} column{'s' if columns > 1 else ''}")
# Calculate column width
total_gap_width = column_gap * (columns - 1)
column_width = (usable_width - total_gap_width) / columns
# Calculate scale factor to fit column width while preserving aspect ratio
scale_factor = column_width / image.size[0]
# Find content gaps in original image (no scaling)
print("Finding content gaps for optimal page breaks...")
content_gaps = find_content_gaps(image, min_gap_size, blank_ratio)
# Calculate slice positions using scaled height
scaled_usable_height = int(usable_height / scale_factor)
slices = calculate_slices(image.size[1], scaled_usable_height, content_gaps, no_split_content)
# Calculate total pages needed based on columns
total_slices = len(slices)
total_pages = (total_slices + columns - 1) // columns # Ceiling division
print(f"Image will be split into {total_slices} slices across {total_pages} pages")
# Parse page range
if page_range:
try:
start_page, end_page = parse_page_range(page_range, total_pages)
except ValueError as e:
print(f"Error: {e!s}", file=sys.stderr)
sys.exit(1)
# Filter slices based on page range
start_slice = (start_page - 1) * columns
end_slice = min(end_page * columns, total_slices)
slices = slices[start_slice:end_slice]
def add_page_number(canvas, page_num):
if skip_first_number and page_num == 1:
return
# Calculate position based on margins and position choice
if "bottom" in number_position:
y = margin_points + number_size / 2
else: # top
y = page_height - margin_points - number_size / 2
if "left" in number_position:
x = margin_points
else: # right
x = page_width - margin_points
canvas.setRightMargin(margin_points)
canvas.setFont(number_font, number_size)
text = str(page_num)
if "right" in number_position:
text_width = canvas.stringWidth(text, number_font, number_size)
x -= text_width
canvas.drawString(x, y, text)
def add_title(canvas):
if not title:
return
canvas.setFont(title_font, title_size)
text_width = canvas.stringWidth(title, title_font, title_size)
# Calculate y position (at top of page)
y = page_height - margin_points - title_size
# Calculate x position based on alignment
if title_position == "left":
x = margin_points
elif title_position == "right":
x = page_width - margin_points - text_width
else: # center
x = (page_width - text_width) / 2
canvas.drawString(x, y, title)
# Create PDF
print(f"Creating PDF: {output_path}")
c = canvas.Canvas(output_path, pagesize=page_size)
# Add title to first page only
add_title(c)
current_page = 1
for i in range(0, len(slices), columns):
print(f"Processing page {current_page}/{total_pages}...")
# Process each column in the current page
for col in range(columns):
if i + col >= len(slices):
break
start_y, end_y = slices[i + col]
slice_height = end_y - start_y
scaled_slice_height = slice_height * scale_factor
# Safety check for --no-split-content mode
if no_split_content and scaled_slice_height > usable_height:
print(
"Error: A content block is too tall to fit on a single page, and --no-split-content is enabled.",
file=sys.stderr,
)
print("To resolve this, you can:", file=sys.stderr)
print(
" 1. Remove the --no-split-content flag to allow the content to be split across pages.",
file=sys.stderr,
)
print(
" 2. Increase the page height by selecting a larger --page-size (e.g., 'legal' or 'a3-landscape').",
file=sys.stderr,
)
print(
" 3. If your image has small whitespace breaks, try a smaller --min-gap value to detect them as split points.",
file=sys.stderr,
)
print(
" 4. If your image's blank areas have noise, try --blank-ratio to allow for imperfectly blank lines.",
file=sys.stderr,
)
sys.exit(1)
slice_img = image.crop((0, start_y, image.size[0], end_y))
# Save temporary slice
temp_slice_path = f"temp_slice_{i + col}.png"
slice_img.save(temp_slice_path)
# Calculate position for this column
x_pos = margin_points + (column_width + column_gap) * col
scaled_slice_height = slice_height * scale_factor
if enable_ocr:
# Add OCR layer before drawing image
add_ocr_layer(
slice_img,
c,
x_pos,
page_height - scaled_slice_height - margin_points,
column_width,
scaled_slice_height,
lang=ocr_lang,
)
# Draw the image (OCR layer will be underneath)
c.drawImage(
temp_slice_path,
x_pos,
page_height - scaled_slice_height - margin_points,
width=column_width,
height=scaled_slice_height,
)
# Remove temporary file
os.remove(temp_slice_path)
# Add page number if enabled
if add_page_numbers:
add_page_number(c, current_page)
# Add new page if not last page
if i + columns < len(slices):
c.showPage()
current_page += 1
c.save()
print("PDF creation complete!")
def main():
parser = argparse.ArgumentParser(description="Convert tall image to multi-page PDF")
parser.add_argument("input_file", help="Input image file")
parser.add_argument("--output", "-o", help="Output PDF file (default: input_name.pdf)")
parser.add_argument(
"--page-size",
"-p",
choices=sorted(PAGE_SIZES.keys()),
default="a4",
help="Page size (default: a4)",
)
parser.add_argument("--margin", "-m", default="10mm", help="Margin size in px or mm (default: 10mm)")
parser.add_argument(
"--min-gap",
"-g",
type=int,
default=50,
help="Minimum gap size in pixels to consider for page breaks (default: 50)",
)
parser.add_argument(
"--blank-ratio",
"-b",
type=float,
default=0.0,
help="Ratio of non-blank to blank pixels allowed in blank lines (0.0=strict, 0.1=10%% non-blank allowed, default: 0.0)",
)
parser.add_argument(
"--no-split-content",
action="store_true",
help="Prevents content blocks from being split. Will error if a block is too tall to fit on a single page.",
)
# Add page numbering arguments
parser.add_argument(
"--page-numbers",
action="store_true",
default=True,
help="Add page numbers (default)",
)
parser.add_argument(
"--no-page-numbers",
action="store_false",
dest="page_numbers",
help="Disable page numbers",
)
parser.add_argument(
"--number-position",
choices=["bottom-left", "bottom-right", "top-left", "top-right"],
default="bottom-left",
help="Position of page numbers",
)
parser.add_argument("--number-font", default="Helvetica", help="Font for page numbers")
parser.add_argument(
"--number-size",
type=int,
default=10,
help="Font size for page numbers in points",
)
parser.add_argument(
"--skip-first-number",
action="store_true",
default=True,
help="Skip page number on first page",
)
# Add title arguments
parser.add_argument(
"--title",
help='Add title to first page. Use "from-filename" to use input filename',
)
parser.add_argument(
"--title-position",
choices=["left", "center", "right"],
default="center",
help="Position of title",
)
parser.add_argument("--title-font", default="Helvetica-Bold", help="Font for title")
parser.add_argument("--title-size", type=int, default=14, help="Font size for title in points")
# Add page range option
parser.add_argument("--page-range", help="Page range to output (e.g., 5, 5-10)")
# Add column arguments
parser.add_argument(
"--columns",
"-c",
type=int,
help="Number of columns per page (default: auto-calculated)",
)
parser.add_argument(
"--column-gap",
type=float,
default=20.0,
help="Gap between columns in points (default: 20.0)",
)
# Add debug flag
parser.add_argument("--debug", action="store_true", help="Show detailed debug information")
# Add OCR arguments
parser.add_argument("--ocr", action="store_true", help="Enable OCR text layer (requires tesseract)")
parser.add_argument("--ocr-lang", default="eng", help="OCR language (default: eng)")
parser.add_argument("--no-ocr", action="store_false", dest="ocr", help="Disable OCR (default)")
args = parser.parse_args()
# Set output filename if not specified
if not args.output:
base_name = os.path.splitext(args.input_file)[0]
args.output = f"{base_name}.pdf"
# Convert margin to pixels
margin_points = parse_margin(args.margin)
# Process title if specified
title: str | None = None
if args.title:
title = title_from_filename(args.input_file) if args.title == "from-filename" else args.title
try:
print(f"Opening image: {args.input_file}")
# Open and trim image
with Image.open(args.input_file) as img:
print("Trimming whitespace from image edges...")
trimmed_img = trim_whitespace(img)
# Create PDF
create_pdf(
trimmed_img,
args.output,
PAGE_SIZES[args.page_size.lower()],
margin_points,
args.min_gap,
blank_ratio=args.blank_ratio,
columns=args.columns,
column_gap=args.column_gap,
add_page_numbers=args.page_numbers,
number_position=args.number_position,
number_font=args.number_font,
number_size=args.number_size,
skip_first_number=args.skip_first_number,
title=title,
title_position=args.title_position,
title_font=args.title_font,
title_size=args.title_size,
page_range=args.page_range,
enable_ocr=args.ocr,
ocr_lang=args.ocr_lang,
debug=args.debug,
no_split_content=args.no_split_content,
)
print(f"Successfully created PDF: {args.output}")
except Exception as e:
print(f"Error: {e!s}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()