diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 2e8d138c2..fdae71173 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -329,8 +329,9 @@ private static void generateOutputs(String inputPdfName, List> con imagesDirectory = config.getOutputFolder() + File.separator + baseName + MarkdownSyntax.IMAGES_DIRECTORY_SUFFIX; } StaticLayoutContainers.setImagesDirectory(imagesDirectory); - ImagesUtils imagesUtils = new ImagesUtils(); - imagesUtils.write(contents, inputPdfName, config.getPassword()); + try (ImagesUtils imagesUtils = new ImagesUtils()) { + imagesUtils.write(contents, inputPdfName, config.getPassword()); + } } if (config.isGeneratePDF()) { PDFWriter pdfWriter = new PDFWriter(); diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java index edb21f51e..79377fd6c 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java @@ -15,6 +15,12 @@ */ package org.opendataloader.pdf.utils; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.graphics.image.PDImage; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.opendataloader.pdf.containers.StaticLayoutContainers; import org.opendataloader.pdf.entities.SemanticPicture; import org.opendataloader.pdf.markdown.MarkdownSyntax; @@ -27,24 +33,28 @@ import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder; import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderCell; import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderRow; -import org.verapdf.wcag.algorithms.semanticalgorithms.consumers.ContrastRatioConsumer; import org.verapdf.wcag.algorithms.semanticalgorithms.containers.StaticContainers; import javax.imageio.ImageIO; +import java.awt.geom.Point2D; import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Objects; import java.util.logging.Level; import java.util.logging.Logger; -public class ImagesUtils { +public class ImagesUtils implements AutoCloseable { private static final Logger LOGGER = Logger.getLogger(ImagesUtils.class.getCanonicalName()); - private ContrastRatioConsumer contrastRatioConsumer; - public ContrastRatioConsumer getContrastRatioConsumer() { - return contrastRatioConsumer; - } + private org.apache.pdfbox.pdmodel.PDDocument pdfBoxDocument; + private final Map> pageImagesByNumber = new HashMap<>(); + private String loadedPdfFilePath; + private String loadedPassword; public void createImagesDirectory(String path) { File directory = new File(path); @@ -55,8 +65,12 @@ public void createImagesDirectory(String path) { public void write(List> contents, String pdfFilePath, String password) { for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) { - for (IObject content : contents.get(pageNumber)) { - writeFromContents(content, pdfFilePath, password); + try { + for (IObject content : contents.get(pageNumber)) { + writeFromContents(content, pdfFilePath, password); + } + } finally { + releasePageImages(pageNumber); } } } @@ -95,29 +109,27 @@ protected void writeImage(ImageChunk chunk, String pdfFilePath, String password) int currentImageIndex = StaticLayoutContainers.incrementImageIndex(); if (currentImageIndex == 1) { createImagesDirectory(StaticLayoutContainers.getImagesDirectory()); - contrastRatioConsumer = StaticLayoutContainers.getContrastRatioConsumer(pdfFilePath, password, false, null); } String imageFormat = StaticLayoutContainers.getImageFormat(); String fileName = String.format(MarkdownSyntax.IMAGE_FILE_NAME_FORMAT, StaticLayoutContainers.getImagesDirectory(), File.separator, currentImageIndex, imageFormat); chunk.setIndex(currentImageIndex); - createImageFile(chunk.getBoundingBox(), fileName, imageFormat); + createImageFile(chunk.getBoundingBox(), fileName, imageFormat, pdfFilePath, password); } protected void writePicture(SemanticPicture picture, String pdfFilePath, String password) { int pictureIndex = picture.getPictureIndex(); - if (contrastRatioConsumer == null) { + if (pictureIndex == 1) { createImagesDirectory(StaticLayoutContainers.getImagesDirectory()); - contrastRatioConsumer = StaticLayoutContainers.getContrastRatioConsumer(pdfFilePath, password, false, null); } String imageFormat = StaticLayoutContainers.getImageFormat(); String fileName = String.format(MarkdownSyntax.IMAGE_FILE_NAME_FORMAT, StaticLayoutContainers.getImagesDirectory(), File.separator, pictureIndex, imageFormat); - createImageFile(picture.getBoundingBox(), fileName, imageFormat); + createImageFile(picture.getBoundingBox(), fileName, imageFormat, pdfFilePath, password); } - private void createImageFile(BoundingBox imageBox, String fileName, String imageFormat) { + private void createImageFile(BoundingBox imageBox, String fileName, String imageFormat, String pdfFilePath, String password) { try { File outputFile = new File(fileName); - BufferedImage targetImage = contrastRatioConsumer != null ? contrastRatioConsumer.getPageSubImage(imageBox) : null; + BufferedImage targetImage = extractPageImage(imageBox, pdfFilePath, password); if (targetImage == null) { return; } @@ -127,8 +139,267 @@ private void createImageFile(BoundingBox imageBox, String fileName, String image } } + protected BufferedImage extractPageImage(BoundingBox imageBox, String pdfFilePath, String password) throws IOException { + if (imageBox == null || imageBox.getPageNumber() == null) { + return null; + } + + ensurePdfDocument(pdfFilePath, password); + int pageIndex = imageBox.getPageNumber(); + if (pageIndex < 0 || pageIndex >= pdfBoxDocument.getNumberOfPages()) { + return null; + } + + PageImageReference pageImage = findBestPageImage(imageBox, pageIndex); + if (pageImage == null) { + return null; + } + + pageImage.markUsed(); + return pageImage.getImage(); + } + + private PageImageReference findBestPageImage(BoundingBox targetBoundingBox, int pageIndex) throws IOException { + List pageImages = getPageImages(pageIndex); + PageImageReference bestOverlapMatch = null; + double bestOverlapArea = 0.0; + double bestAreaDelta = Double.MAX_VALUE; + for (PageImageReference pageImage : pageImages) { + if (pageImage.isUsed()) { + continue; + } + double overlapArea = getIntersectionArea(targetBoundingBox, pageImage.getBoundingBox()); + if (overlapArea <= 0.0) { + continue; + } + double areaDelta = getAreaDeltaRatio(targetBoundingBox, pageImage.getBoundingBox()); + if (bestOverlapMatch == null || overlapArea > bestOverlapArea + || (Double.compare(overlapArea, bestOverlapArea) == 0 && areaDelta < bestAreaDelta)) { + bestOverlapMatch = pageImage; + bestOverlapArea = overlapArea; + bestAreaDelta = areaDelta; + } + } + if (bestOverlapMatch != null) { + return bestOverlapMatch; + } + return null; + } + + private List getPageImages(int pageIndex) throws IOException { + List pageImages = pageImagesByNumber.get(pageIndex); + if (pageImages != null) { + return pageImages; + } + + List extractedImages = new ArrayList<>(); + PDPage page = pdfBoxDocument.getPage(pageIndex); + new PageImageCollector(page, pageIndex, extractedImages).processPage(page); + pageImagesByNumber.put(pageIndex, extractedImages); + return extractedImages; + } + + private double getIntersectionArea(BoundingBox first, BoundingBox second) { + double left = Math.max(first.getLeftX(), second.getLeftX()); + double right = Math.min(first.getRightX(), second.getRightX()); + double bottom = Math.max(first.getBottomY(), second.getBottomY()); + double top = Math.min(first.getTopY(), second.getTopY()); + if (right <= left || top <= bottom) { + return 0.0; + } + return (right - left) * (top - bottom); + } + + private double getAreaDeltaRatio(BoundingBox first, BoundingBox second) { + double firstArea = getBoundingBoxArea(first); + double secondArea = getBoundingBoxArea(second); + double largerArea = Math.max(1.0, Math.max(firstArea, secondArea)); + return Math.abs(firstArea - secondArea) / largerArea; + } + + private double getBoundingBoxArea(BoundingBox boundingBox) { + return Math.max(0.0, boundingBox.getWidth()) * Math.max(0.0, boundingBox.getHeight()); + } + + private void ensurePdfDocument(String pdfFilePath, String password) throws IOException { + if (pdfBoxDocument != null) { + if (Objects.equals(loadedPdfFilePath, pdfFilePath) && Objects.equals(loadedPassword, password)) { + return; + } + releaseAllPageImages(); + closePdfDocument(); + } + File pdfFile = new File(pdfFilePath); + pdfBoxDocument = password != null && !password.isEmpty() + ? Loader.loadPDF(pdfFile, password) + : Loader.loadPDF(pdfFile); + loadedPdfFilePath = pdfFilePath; + loadedPassword = password; + } + + private void releasePageImages(int pageIndex) { + List pageImages = pageImagesByNumber.remove(pageIndex); + if (pageImages == null) { + return; + } + for (PageImageReference pageImage : pageImages) { + pageImage.release(); + } + } + + private void releaseAllPageImages() { + for (List pageImages : pageImagesByNumber.values()) { + for (PageImageReference pageImage : pageImages) { + pageImage.release(); + } + } + pageImagesByNumber.clear(); + } + + private void closePdfDocument() throws IOException { + if (pdfBoxDocument == null) { + return; + } + try { + pdfBoxDocument.close(); + } finally { + pdfBoxDocument = null; + loadedPdfFilePath = null; + loadedPassword = null; + } + } + public static boolean isImageFileExists(String fileName) { File outputFile = new File(fileName); return outputFile.exists(); } + + @Override + public void close() throws IOException { + releaseAllPageImages(); + if (pdfBoxDocument != null) { + closePdfDocument(); + } + } + + private static final class PageImageReference { + private final PDImage image; + private final BoundingBox boundingBox; + private BufferedImage bufferedImage; + private boolean used; + + private PageImageReference(PDImage image, BoundingBox boundingBox) { + this.image = image; + this.boundingBox = boundingBox; + } + + private BoundingBox getBoundingBox() { + return boundingBox; + } + + private boolean isUsed() { + return used; + } + + private void markUsed() { + used = true; + } + + private BufferedImage getImage() throws IOException { + if (bufferedImage == null) { + if (image instanceof PDImageXObject) { + bufferedImage = ((PDImageXObject) image).getOpaqueImage(); + } else { + bufferedImage = image.getImage(); + } + } + return bufferedImage; + } + + private void release() { + if (bufferedImage != null) { + bufferedImage.flush(); + bufferedImage = null; + } + } + } + + private static final class PageImageCollector extends PDFGraphicsStreamEngine { + private final int pageIndex; + private final List pageImages; + private Point2D currentPoint = new Point2D.Float(); + + private PageImageCollector(PDPage page, int pageIndex, List pageImages) { + super(page); + this.pageIndex = pageIndex; + this.pageImages = pageImages; + } + + @Override + public void drawImage(PDImage pdImage) { + Point2D lowerLeft = transformedPoint(0, 0); + Point2D lowerRight = transformedPoint(1, 0); + Point2D upperLeft = transformedPoint(0, 1); + Point2D upperRight = transformedPoint(1, 1); + + double left = Math.min(Math.min(lowerLeft.getX(), lowerRight.getX()), Math.min(upperLeft.getX(), upperRight.getX())); + double right = Math.max(Math.max(lowerLeft.getX(), lowerRight.getX()), Math.max(upperLeft.getX(), upperRight.getX())); + double bottom = Math.min(Math.min(lowerLeft.getY(), lowerRight.getY()), Math.min(upperLeft.getY(), upperRight.getY())); + double top = Math.max(Math.max(lowerLeft.getY(), lowerRight.getY()), Math.max(upperLeft.getY(), upperRight.getY())); + + pageImages.add(new PageImageReference(pdImage, new BoundingBox(pageIndex, left, bottom, right, top))); + } + + @Override + public void appendRectangle(Point2D point0, Point2D point1, Point2D point2, Point2D point3) { + } + + @Override + public void clip(int windingRule) { + } + + @Override + public void moveTo(float x, float y) { + currentPoint = new Point2D.Float(x, y); + } + + @Override + public void lineTo(float x, float y) { + currentPoint = new Point2D.Float(x, y); + } + + @Override + public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) { + currentPoint = new Point2D.Float(x3, y3); + } + + @Override + public Point2D getCurrentPoint() { + return currentPoint; + } + + @Override + public void closePath() { + } + + @Override + public void endPath() { + } + + @Override + public void strokePath() { + } + + @Override + public void fillPath(int windingRule) { + } + + @Override + public void fillAndStrokePath(int windingRule) { + } + + @Override + public void shadingFill(COSName shadingName) { + } + } } diff --git a/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ImagesUtilsTest.java b/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ImagesUtilsTest.java index f45ea84b0..da2fd9294 100644 --- a/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ImagesUtilsTest.java +++ b/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/utils/ImagesUtilsTest.java @@ -15,11 +15,16 @@ */ package org.opendataloader.pdf.utils; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.junit.jupiter.api.Test; import org.opendataloader.pdf.containers.StaticLayoutContainers; import org.verapdf.wcag.algorithms.entities.content.ImageChunk; import org.verapdf.wcag.algorithms.entities.geometry.BoundingBox; +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -65,31 +70,39 @@ void testCreateImagesDirectory() throws IOException { } @Test - void testWriteImageInitializesContrastRatioConsumer() throws IOException { + void testWriteImageExtractsImageObject() throws IOException { StaticLayoutContainers.clearContainers(); // Given Path tempDir = Files.createTempDirectory("htmlgen-test"); - File testPdf = new File("../../samples/pdf/lorem.pdf"); + File testPdf = new File("../../samples/pdf/chinese_scan.pdf"); String outputFolder = tempDir.toString(); // When try { - // Then - if ContrastRatioConsumer wasn't initialized, - // it would be null and cause NPE when used Path path = Paths.get(testPdf.getAbsolutePath()); - ImagesUtils imagesUtils = new ImagesUtils(); - assertNull(imagesUtils.getContrastRatioConsumer()); - StaticLayoutContainers.setImagesDirectory(outputFolder + File.separator + path.getFileName().toString().substring(0, path.getFileName().toString().length() - 4) + "_images"); - ImageChunk imageChunk = new ImageChunk(new BoundingBox(0)); - // Initializing contrastRatioConsumer in writeImage() - imagesUtils.writeImage(imageChunk, testPdf.getAbsolutePath(),""); - assertNotNull(imagesUtils.getContrastRatioConsumer()); - // Verify file was created - Path pngPath = Path.of(StaticLayoutContainers.getImagesDirectory(), "imageFile1.png"); - // PNG file is created - assertTrue(Files.exists(pngPath), "PNG file created successfully"); + try (ImagesUtils imagesUtils = new ImagesUtils()) { + StaticLayoutContainers.setImageFormat("png"); + StaticLayoutContainers.setImagesDirectory(outputFolder + File.separator + path.getFileName().toString().substring(0, path.getFileName().toString().length() - 4) + "_images"); + BoundingBox pageImageBoundingBox; + try (PDDocument pdfDocument = Loader.loadPDF(testPdf)) { + PDRectangle cropBox = pdfDocument.getPage(0).getCropBox(); + pageImageBoundingBox = new BoundingBox( + 0, + cropBox.getLowerLeftX(), + cropBox.getLowerLeftY(), + cropBox.getUpperRightX(), + cropBox.getUpperRightY() + ); + } + ImageChunk imageChunk = new ImageChunk(pageImageBoundingBox); + imagesUtils.writeImage(imageChunk, testPdf.getAbsolutePath(),""); + + Path pngPath = Path.of(StaticLayoutContainers.getImagesDirectory(), "imageFile1.png"); + assertTrue(Files.exists(pngPath), "PNG file created successfully"); + BufferedImage image = ImageIO.read(pngPath.toFile()); + assertNotNull(image, "PNG file should be readable"); + assertTrue(image.getWidth() > 0 && image.getHeight() > 0, "PNG file should contain pixels"); + } } finally { - // Cleanup - StaticLayoutContainers.closeContrastRatioConsumer(); Files.walk(tempDir) .sorted((a, b) -> b.compareTo(a)) .forEach(p -> {