osteele · uho-33 · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025 · Sep 22, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 All notable changes to this project will be documented in this file.
 
+## [0.1.2]
+
+### Added
+- Added `--no-split-content` flag to prevent content blocks from being split across pages
+- Added `--blank-ratio` argument to control strictness of blank line detection
+
+
 ## [0.1.1]
 
 ### Added

diff --git a/README.md b/README.md
@@ -130,6 +130,8 @@ scrollshot2pdf input.png \
 - `--columns`, `-c`: Number of columns per page (default: 1)
 - `--column-gap`: Gap between columns in points (default: 20.0)
 - `--min-gap`, `-g`: Minimum gap size in pixels for page breaks (default: 50)
+- `--blank-ratio`, `-b`: Maximum fraction of non-blank pixels a row may contain and still be treated as blank (default: 0.0)
+- `--no-split-content`: Prevents content blocks from being split across pages. Will error if a block is too tall for one page.
 
 ### Page Numbers
 - `--page-numbers`: Add page numbers (default: enabled)
@@ -213,17 +215,56 @@ Margins can be specified in:
 
 1. The script first trims any whitespace borders from the input image
 2. It scales the image to fit the page width while maintaining aspect ratio
-3. It analyzes the image to find vertical gaps in content
+3. It analyzes the image to find vertical gaps in content using configurable blank line detection
 4. It calculates optimal slice positions based on page height and content gaps
 5. Finally, it creates a PDF with one slice per page, adding specified margins
 
-## Example
+### Blank Line Detection
+
+The tool detects content gaps by analyzing each horizontal row of pixels. By default, a row is considered "blank" only if all pixels are nearly white (> 250 on a 0-255 scale). This strict detection can be relaxed using the `--blank-ratio` option:
+
+- `--blank-ratio 0.0` (default): Strict mode - all pixels must be nearly white
+- `--blank-ratio 0.1`: Allow up to 10% of pixels to be non-blank and still consider the row "blank"
+- `--blank-ratio 0.2`: Allow up to 20% of pixels to be non-blank, and so on
+
+This is particularly useful for images with:
+- Slightly noisy or imperfect blank areas
+- Compression artifacts in white space
+- Subtle background patterns or textures
+
+### Content Splitting Control
+
+The tool's main goal is to slice the image at ideal page breaks. By default, it prioritizes splitting at natural content gaps to avoid awkward breaks. The `--no-split-content` flag changes the fallback behavior when a content block is too long to fit on one page.
+
+- **Default Behavior:** The tool first looks for a content gap within the page's height. If no gap is found, it will split the content directly at the page boundary, which can cut through text or images.
+- **With `--no-split-content`:** The tool also looks for a content gap within the page's height. However, if no gap is found, it will **not** split the content. Instead, it extends the slice downwards until it reaches the next available gap, or the end of the image if no gaps remain.
+
+This is useful for ensuring that logical blocks of content (like paragraphs, code blocks, or images) are never cut in half.
+
+**Important**: Because this can create slices taller than the specified page, the tool will exit with an error if a resulting slice cannot fit on the page. To resolve this, you can:
+
+1.  Remove the `--no-split-content` flag to allow content splitting.
+2.  Use a larger page size (e.g., `--page-size legal` or `--page-size a3-landscape`).
+3.  Use a smaller `--min-gap` value to help the tool detect more potential split points in your image.
+4.  If your image's blank areas have noise, try `--blank-ratio` to allow for imperfectly blank lines.
+
+## Examples
 
 Converting a tall screenshot into a 3-page PDF with 25mm margins:
 ```bash
 scrollshot2pdf screenshot.png --margin 25mm
 ```
 
+Handling images with noisy blank areas by allowing 15% non-blank pixels in "blank" lines:
+```bash
+scrollshot2pdf noisy_image.png --blank-ratio 0.15
+```
+
+Preventing content from being split across pages (may require a larger page size):
+```bash
+scrollshot2pdf long_content.png --no-split-content --page-size legal
+```
+
 ## Dependencies
 
 - Python 3

diff --git a/scrollshot2pdf.py b/scrollshot2pdf.py
@@ -65,10 +65,16 @@ def trim_whitespace(image: Image.Image) -> Image.Image:
     return image
 
 
-def find_content_gaps(image: Image.Image, min_gap_size: int = 50) -> list[int]:
+def find_content_gaps(image: Image.Image, min_gap_size: int = 50, blank_ratio: float = 0.0) -> list[int]:
     """
     Find vertical positions where there are gaps in content.
     Returns a list of y-coordinates where gaps occur.
+
+    Args:
+        image: Image to analyze
+        min_gap_size: Minimum gap size in pixels to consider for page breaks
+        blank_ratio: Maximum fraction of non-blank pixels a row may contain
+                    and still count as blank (0.0=strict, 0.1=10% non-blank allowed)
     """
     # Convert to grayscale for analysis
     gray = image.convert("L")
@@ -86,8 +92,15 @@ def find_content_gaps(image: Image.Image, min_gap_size: int = 50) -> list[int]:
         row_end = row_start + width
         row = pixels[row_start:row_end]
 
-        # Check if row is empty (all white or nearly white pixels)
-        is_empty = all(p > 250 for p in row)
+        # Check if row is empty based on blank_ratio
+        if blank_ratio == 0.0:
+            # Strict mode: all pixels must be > 250 (nearly white)
+            is_empty = all(p > 250 for p in row)
+        else:
+            # Ratio mode: allow some non-blank pixels
+            non_blank_pixels = sum(1 for p in row if p <= 250)
+            non_blank_ratio = non_blank_pixels / len(row)
+            is_empty = non_blank_ratio <= blank_ratio
 
         if is_empty and current_gap_start is None:
             current_gap_start = y
@@ -100,7 +113,9 @@ def find_content_gaps(image: Image.Image, min_gap_size: int = 50) -> list[int]:
     return gaps
 
 
-def calculate_slices(image_height: int, page_height: int, content_gaps: list[int]) -> list[tuple[int, int]]:
+def calculate_slices(
+    image_height: int, page_height: int, content_gaps: list[int], no_split_content: bool = False
+) -> list[tuple[int, int]]:
     """
     Calculate optimal slice positions based on page height and content gaps.
     Returns list of (start_y, end_y) tuples.
@@ -117,19 +132,30 @@ def calculate_slices(image_height: int, page_height: int, content_gaps: list[int
             slices.append((current_pos, image_height))
             break
 
-        # Find nearest content gap
-        nearest_gap = None
-        min_distance = page_height // 4  # Don't look for gaps too far from ideal position
-
+        # Find the last content gap before the ideal split point.
+        best_gap = None
         for gap in content_gaps:
-            if gap > current_pos and gap < ideal_next_pos:
-                distance = abs(gap - ideal_next_pos)
-                if distance < min_distance:
-                    nearest_gap = gap
-                    min_distance = distance
-
-        # Use gap position if found, otherwise use ideal position
-        next_pos = nearest_gap if nearest_gap is not None else ideal_next_pos
+            if current_pos < gap < ideal_next_pos:
+                best_gap = gap
+
+        next_pos = ideal_next_pos
+        if best_gap is not None:
+            # If a gap was found within the page, use it to avoid splitting content.
+            next_pos = best_gap
+        elif no_split_content:
+            # If in strict no-split mode and no gap was found, find the very next
+            # available gap in the image, even if it makes a very long page.
+            next_available_gap = None
+            for gap in content_gaps:
+                if gap > current_pos:
+                    next_available_gap = gap
+                    break
+            if next_available_gap is not None:
+                next_pos = next_available_gap
+            else:
+                # No more gaps in the entire image, so this slice must go to the end.
+                next_pos = image_height
+
         slices.append((current_pos, next_pos))
         current_pos = next_pos
 
@@ -301,6 +327,7 @@ def create_pdf(
     margin_points: float,
     min_gap_size: int = 50,
     *,
+    blank_ratio: float = 0.0,
     columns: int | None = None,
     column_gap: float = 20.0,
     add_page_numbers: bool = True,
@@ -316,6 +343,7 @@ def create_pdf(
     enable_ocr: bool = False,
     ocr_lang: str = "eng",
     debug: bool = False,
+    no_split_content: bool = False,
 ) -> None:
     """Create PDF from image with optional OCR layer."""
 
@@ -376,11 +404,11 @@ def create_pdf(
 
     # Find content gaps in original image (no scaling)
     print("Finding content gaps for optimal page breaks...")
-    content_gaps = find_content_gaps(image, min_gap_size)
+    content_gaps = find_content_gaps(image, min_gap_size, blank_ratio)
 
     # Calculate slice positions using scaled height
     scaled_usable_height = int(usable_height / scale_factor)
-    slices = calculate_slices(image.size[1], scaled_usable_height, content_gaps)
+    slices = calculate_slices(image.size[1], scaled_usable_height, content_gaps, no_split_content)
 
     # Calculate total pages needed based on columns
     total_slices = len(slices)
@@ -461,6 +489,33 @@ def add_title(canvas):
 
             start_y, end_y = slices[i + col]
             slice_height = end_y - start_y
+            scaled_slice_height = slice_height * scale_factor
+
+            # Safety check for --no-split-content mode
+            if no_split_content and scaled_slice_height > usable_height:
+                print(
+                    "Error: A content block is too tall to fit on a single page, and --no-split-content is enabled.",
+                    file=sys.stderr,
+                )
+                print("To resolve this, you can:", file=sys.stderr)
+                print(
+                    "  1. Remove the --no-split-content flag to allow the content to be split across pages.",
+                    file=sys.stderr,
+                )
+                print(
+                    "  2. Increase the page height by selecting a larger --page-size (e.g., 'legal' or 'a3-landscape').",
+                    file=sys.stderr,
+                )
+                print(
+                    "  3. If your image has small whitespace breaks, try a smaller --min-gap value to detect them as split points.",
+                    file=sys.stderr,
+                )
+                print(
+                    "  4. If your image's blank areas have noise, try --blank-ratio to allow for imperfectly blank lines.",
+                    file=sys.stderr,
+                )
+                sys.exit(1)
+
             slice_img = image.crop((0, start_y, image.size[0], end_y))
 
             # Save temporary slice
@@ -527,6 +582,18 @@ def main():
         default=50,
         help="Minimum gap size in pixels to consider for page breaks (default: 50)",
     )
+    parser.add_argument(
+        "--blank-ratio",
+        "-b",
+        type=float,
+        default=0.0,
+        help="Ratio of non-blank to blank pixels allowed in blank lines (0.0=strict, 0.1=10%% non-blank allowed, default: 0.0)",
+    )
+    parser.add_argument(
+        "--no-split-content",
+        action="store_true",
+        help="Prevents content blocks from being split. Will error if a block is too tall to fit on a single page.",
+    )
 
     # Add page numbering arguments
     parser.add_argument(
@@ -629,6 +696,7 @@ def main():
                 PAGE_SIZES[args.page_size.lower()],
                 margin_points,
                 args.min_gap,
+                blank_ratio=args.blank_ratio,
                 columns=args.columns,
                 column_gap=args.column_gap,
                 add_page_numbers=args.page_numbers,
@@ -644,6 +712,7 @@ def main():
                 enable_ocr=args.ocr,
                 ocr_lang=args.ocr_lang,
                 debug=args.debug,
+                no_split_content=args.no_split_content,
             )
 
         print(f"Successfully created PDF: {args.output}")