update caj2pdf

qinlili23333 · Dec 20, 2022 · dfbcb7c · dfbcb7c
1 parent 68352db
commit dfbcb7c
Show file tree

Hide file tree

Showing 7 changed files with 168 additions and 34 deletions.
diff --git a/printmodule/caj/HNParsePage.py b/printmodule/caj/HNParsePage.py
@@ -15,6 +15,8 @@ def __init__(self, data, old_style=False):
         def Text(self, code):
             try:
                 self.characters.append(bytes([self.data[self.offset+5],self.data[self.offset+4]]).decode("gbk"))
+            except IndexError: # short data, nothing to do
+                pass
             except UnicodeDecodeError:
                 # HTL: When cut-and-paste on Linux, these transform to GB18030,
                 # but I believe they are OCR artifacts. Where they occur,
@@ -44,12 +46,24 @@ def TextMulti(self, code):
             if (code == 0x8001):
                 self.characters.append("\n")
             while (1):
-                if (self.data[self.offset+1] == 0x80):
-                    break
-                self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk"))
+                try:
+                    if (self.data[self.offset+1] == 0x80):
+                        break
+                except IndexError: # short data, nothing to do
+                    return
+                try:
+                    self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk"))
+                except UnicodeDecodeError:
+                    self.characters.append("<0x%04X>\n" % (self.data[self.offset+3] * 256 + self.data[self.offset+2]))
+                except IndexError: # short data, nothing to do
+                    return
                 self.offset += 4
 
         def Figure(self, code):
+            try:
+                self.data[self.offset+25]
+            except IndexError: # short data, nothing to do
+                return
             (ignore1, offset_x, offset_y, size_x, size_y, int2, int3, int4, int5)= struct.unpack("<HHHHHIIII", self.data[self.offset:self.offset+26])
             # in units of 1/2.473 pixels
             self.figures.append([offset_x, offset_y, size_x, size_y])

diff --git a/printmodule/caj/__pycache__/cajparser.cpython-310.pyc b/printmodule/caj/__pycache__/cajparser.cpython-310.pyc
diff --git a/printmodule/caj/__pycache__/utils.cpython-310.pyc b/printmodule/caj/__pycache__/utils.cpython-310.pyc
diff --git a/printmodule/caj/cajparser.py b/printmodule/caj/cajparser.py
@@ -2,7 +2,12 @@
 import struct
 from shutil import copy
 from subprocess import check_output, STDOUT, CalledProcessError
-from utils import fnd, fnd_all, add_outlines, fnd_rvrs, fnd_unuse_no
+from utils import fnd, fnd_all, add_outlines, fnd_rvrs, fnd_unuse_no, find_redundant_images
+
+try:
+    from PyPDF2 import errors
+except ImportError:
+    from PyPDF2 import utils as errors
 
 KDH_PASSPHRASE = b"FZHMEI"
 
@@ -287,10 +292,23 @@ def _convert_caj(self, dest):
             check_output(["mutool", "clean", "pdf.tmp", "pdf_toc.pdf"], stderr=STDOUT)
         except CalledProcessError as e:
             print(e.output.decode("utf-8"))
-            raise SystemExit("Command mutool returned non-zero exit status " + str(e.returncode))
+            print("Command mutool returned non-zero exit status " + str(e.returncode))
+            print("Try merge mode...")
+            os.remove("pdf_toc.pdf")
+            try:
+                 check_output(["mutool", "merge", "-opdf_toc.pdf", "pdf.tmp"], stderr=STDOUT)
+            except CalledProcessError as e:
+                    print(e.output.decode("utf-8"))
+                    SystemExit("Merge mode also failed.")
+
 
         # Add Outlines
-        add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
+        try:
+            add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
+        except errors.PdfReadError as e:
+            print("errors.PdfReadError:", str(e))
+            copy("pdf_toc.pdf", dest)
+            pass
         os.remove("pdf.tmp")
         os.remove("pdf_toc.pdf")
 
@@ -306,11 +324,14 @@ def _convert_hn(self, dest):
             [page_data_offset, size_of_text_section, images_per_page, page_no, unk2, next_page_data_offset] = struct.unpack("iihhii", caj.read(20))
             caj.seek(page_data_offset)
             text_header_read32 = caj.read(32)
-            if (text_header_read32[8:20] == b'COMPRESSTEXT'):
-                [expanded_text_size] = struct.unpack("i", text_header_read32[20:24])
+            if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')):
+                coff = 8
+                if (text_header_read32[0:12] == b'COMPRESSTEXT'):
+                    coff = 0
+                [expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff])
                 import zlib
-                caj.seek(page_data_offset + 24)
-                data = caj.read(size_of_text_section - 24)
+                caj.seek(page_data_offset + 16 + coff)
+                data = caj.read(size_of_text_section - 16 - coff)
                 output = zlib.decompress(data, bufsize=expanded_text_size)
                 if (len(output) != expanded_text_size):
                     raise SystemExit("Unexpected:", len(output), expanded_text_size)
@@ -321,13 +342,35 @@ def _convert_hn(self, dest):
             page_style = (next_page_data_offset > page_data_offset)
             page_data = HNParsePage(output, page_style)
 
+            current_offset = page_data_offset + size_of_text_section
+            (found, images_per_page) = find_redundant_images(caj, current_offset, images_per_page)
+            if (found):
+                print("Page %d, skipping %d redundant images" % (i+1, images_per_page * ( images_per_page - 1)))
+
             if (images_per_page > 1):
                 if (len(page_data.figures) == images_per_page):
-                    image_list.append(None)
-                    image_list.append(page_data.figures)
+                    if (page_data.figures[0][0] == 0) and (page_data.figures[0][1] == 0):
+                        image_list.append(None)
+                        image_list.append(page_data.figures)
+                    else:
+                        print("Page %d, Image Count %d, first image not at origin, expanding to %d pages"
+                              % (i+1, len(page_data.figures), images_per_page))
                 else:
-                    raise SystemExit("Image Count %d != %d" % (len(page_data.figures), images_per_page))
-            current_offset = page_data_offset + size_of_text_section
+                    print("Page %d, Image Count %d != %d" % (i+1, len(page_data.figures), images_per_page))
+                    if (len(page_data.figures) > images_per_page):
+                        print("\tTruncating Page %d," % (i+1), page_data.figures)
+                        image_list.append(None)
+                        image_list.append(page_data.figures[0:images_per_page])
+                    else:
+                        print("Page %d expanding to %d separate image pages" % (i+1, images_per_page))
+            elif (images_per_page == 1):
+                if ((len(page_data.figures) == 0) or
+                    ((len(page_data.figures) > 0) and
+                    (not ((page_data.figures[0][0] == 0) and (page_data.figures[0][1] == 0))))):
+                    print("Page %d possibly text-only + single figure(%d)" % (i+1, len(page_data.figures)))
+            else:
+                # don't care about images_per_page == 0
+                pass
             for j in range(images_per_page):
                 caj.seek(current_offset)
                 read32 = caj.read(32)
@@ -380,12 +423,31 @@ def _convert_hn(self, dest):
                         0
                     )
                 elif (image_type[image_type_enum] == "JPEG"):
-                    (height, width) = struct.unpack(">HH", image_data[163:167])
+                    colorspace = Colorspace.RGB
+                    component = 3
+                    # stock libjpeg location
+                    (SOFn, frame_length, bits_per_pixel, height, width, component) = struct.unpack(">HHBHHB", image_data[158:168])
+                    if (SOFn != 0xFFC0):
+                        # "Intel(R) JPEG Library" location
+                        (SOFn, frame_length, bits_per_pixel, height, width, component) = struct.unpack(">HHBHHB", image_data[0x272:0x27c])
+                        if (SOFn != 0xFFC0):
+                            # neither works, try brute-force
+                            import imagesize
+                            from PIL import Image as pilimage
+                            with open(".tmp.jpg", "wb") as f:
+                                f.write(image_data)
+                                (width, height) = imagesize.get(".tmp.jpg")
+                                pim = pilimage.open(".tmp.jpg")
+                                if (pim.mode == 'L'):
+                                    component = 1
+                            os.remove(".tmp.jpg")
                     if (image_type_enum == 1):
                         # non-inverted JPEG Images
                         height = -height
+                    if (component == 1):
+                        colorspace = Colorspace.L
                     image_item = (
-                        Colorspace.RGB,
+                        colorspace,
                         (300, 300),
                         ImageFormat.JPEG,
                         image_data,
@@ -418,11 +480,14 @@ def _text_extract_hn(self):
             [page_data_offset, size_of_text_section, images_per_page, page_no, unk2, next_page_data_offset] = struct.unpack("iihhii", caj.read(20))
             caj.seek(page_data_offset)
             text_header_read32 = caj.read(32)
-            if (text_header_read32[8:20] == b'COMPRESSTEXT'):
-                [expanded_text_size] = struct.unpack("i", text_header_read32[20:24])
+            if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')):
+                coff = 8
+                if (text_header_read32[0:12] == b'COMPRESSTEXT'):
+                    coff = 0
+                [expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff])
                 import zlib
-                caj.seek(page_data_offset + 24)
-                data = caj.read(size_of_text_section - 24)
+                caj.seek(page_data_offset + 16 + coff)
+                data = caj.read(size_of_text_section - 16 - coff)
                 output = zlib.decompress(data, bufsize=expanded_text_size)
                 if (len(output) != expanded_text_size):
                     raise SystemExit("Unexpected:", len(output), expanded_text_size)
@@ -454,12 +519,15 @@ def _parse_hn(self):
             # The first 8 bytes are always: 03 80 XX 16 03 80 XX XX,
             # the last one 20 or 21, but the first two can be any.
             # 48/71 has: 03 80 E0 16 03 80 F7 20, the rest uniq
-            if (text_header_read32[8:20] == b'COMPRESSTEXT'):
+            if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')):
+                coff = 8
+                if (text_header_read32[0:12] == b'COMPRESSTEXT'):
+                    coff = 0
                 # expanded_text_size seems to be always about 2-3 times size_of_text_section, so this is a guess.
-                [expanded_text_size] = struct.unpack("i", text_header_read32[20:24])
+                [expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff])
                 import zlib
-                caj.seek(page_data_offset + 24)
-                data = caj.read(size_of_text_section - 24)
+                caj.seek(page_data_offset + 16 + coff)
+                data = caj.read(size_of_text_section - 16 - coff)
                 output = zlib.decompress(data, bufsize=expanded_text_size)
                 if (len(output) != expanded_text_size):
                     print("Unexpected:", len(output), expanded_text_size)

diff --git a/printmodule/caj/jbig2dec.py b/printmodule/caj/jbig2dec.py
@@ -62,7 +62,7 @@ def DecodeJbig2(self):
     # PBM is only padded to 8 rather than 32.
     # If the padding is larger, write padded file.
     if (cimage.bytes_per_line > ((cimage.width +7) >> 3)):
-        cimage.width = bytes_per_line << 3
+        cimage.width = cimage.bytes_per_line << 3
 
     with open(sys.argv[2], "wb") as fout:
         fout.write("P4\n".encode("ascii"))

diff --git a/printmodule/caj/pdfwutils.py b/printmodule/caj/pdfwutils.py
@@ -1078,7 +1078,10 @@ def add_multi_imagepage(
                 image1[PdfName.Height] = -Im_i['imgheightpx']
             else:
                 image1[PdfName.Height] = Im_i['imgheightpx']
-            image1[PdfName.ColorSpace] = PdfName.DeviceRGB
+            if Im_i['color'] == Colorspace.L:
+                image1[PdfName.ColorSpace] = PdfName.DeviceGray
+            else:
+                image1[PdfName.ColorSpace] = PdfName.DeviceRGB
             image1[PdfName.BitsPerComponent] = Im_i['depth']
 
             offset_x = coordinates[i][0] / 300 * 72 / 2.473

diff --git a/printmodule/caj/utils.py b/printmodule/caj/utils.py
@@ -1,7 +1,12 @@
 import os
 import sys
-import PyPDF2.pdf as PDF
-from PyPDF2 import PdfFileWriter, PdfFileReader
+import struct
+import PyPDF2.generic as PDF
+try:
+    from PyPDF2 import PdfWriter, PdfReader
+except ImportError:
+    from PyPDF2 import PdfFileWriter as PdfWriter
+    from PyPDF2 import PdfFileReader as PdfReader
 
 
 class Node(object):
@@ -149,7 +154,10 @@ def fnd_unuse_no(nos1, nos2):
 
 def make_dest(pdfw, pg):
     d = PDF.ArrayObject()
-    d.append(pdfw.getPage(pg).indirectRef)
+    try:
+        d.append(pdfw.getPage(pg).indirect_ref)
+    except AttributeError:
+        d.append(pdfw.getPage(pg).indirectRef)
     d.append(PDF.NameObject("/XYZ"))
     d.append(PDF.NullObject())
     d.append(PDF.NullObject())
@@ -179,11 +187,14 @@ def build_outlines_btree(toc):
 
 def add_outlines(toc, filename, output):
     build_outlines_btree(toc)
-    pdf_out = PdfFileWriter()
+    pdf_out = PdfWriter()
     inputFile = open(filename, 'rb')
-    pdf_in = PdfFileReader(inputFile)
+    pdf_in = PdfReader(inputFile)
     for p in pdf_in.pages:
-        pdf_out.addPage(p)
+        try:
+            pdf_out.add_page(p)
+        except AttributeError:
+            pdf_out.addPage(p)
     toc_num = len(toc)
     if (toc_num == 0): # Just copy if toc empty
         outputFile = open(output, "wb")
@@ -217,13 +228,51 @@ def add_outlines(toc, filename, output):
                     PDF.NameObject(v): idorefs[n.index]
                 })
         olitems.append(oli)
-    pdf_out._addObject(ol)
+    try:
+        pdf_out._add_object(ol)
+    except AttributeError:
+        pdf_out._addObject(ol)
     for i in olitems:
-        pdf_out._addObject(i)
+        try:
+            pdf_out._add_object(i)
+        except AttributeError:
+            pdf_out._addObject(i)
     pdf_out._root_object.update({
         PDF.NameObject("/Outlines"): idorefs[0]
     })
     outputFile = open(output, "wb")
     pdf_out.write(outputFile)
     inputFile.close()
     outputFile.close()
+
+# Detect a page whose image table holds N * N records that are really N images repeated N times (1 < N <= 10).
+# Only the size field of each record is compared (record j against record j - N); pixel data is never read.
+# Returns (True, N) on a match, otherwise (False, images_per_page) unchanged. Moves the read position of `caj`.
+def find_redundant_images(caj, initial_offset, images_per_page):
+    sqrts = {  # perfect squares 4..100 mapped to their integer roots
+        4  : 2,
+        9  : 3,
+        16 : 4,
+        25 : 5,
+        36 : 6,
+        49 : 7,
+        64 : 8,
+        81 : 9,
+        100 : 10,
+    }
+
+    if (not (images_per_page in sqrts.keys())):  # count is not N * N for any N in 2..10
+        return (False, images_per_page)
+    stride = sqrts[images_per_page]  # N: distance between a record and its suspected duplicate
+    sizes = []  # size_of_image_data of every record seen so far, in file order
+    current_offset = initial_offset
+    for j in range(images_per_page):
+        caj.seek(current_offset)
+        read32 = caj.read(32)  # NOTE(review): a read shorter than 12 bytes makes the unpack below raise struct.error
+        [image_type_enum, offset_to_image_data, size_of_image_data] = struct.unpack("iii", read32[0:12])  # three native-order ints; image_type_enum is unused here
+        if ((j >= stride) and (size_of_image_data != sizes[j-stride])):  # first N records have nothing to compare against
+            return (False, images_per_page)
+        sizes.append(size_of_image_data)
+        current_offset = offset_to_image_data + size_of_image_data  # next record is read from just past this image's data
+    # Reaching here means every record's size equals the one N records earlier, i.e. sizes are [A, B, C ... N] repeated N times.
+    return (True, stride)