diff --git a/printmodule/caj/HNParsePage.py b/printmodule/caj/HNParsePage.py index 5d13191..040558d 100644 --- a/printmodule/caj/HNParsePage.py +++ b/printmodule/caj/HNParsePage.py @@ -15,6 +15,8 @@ def __init__(self, data, old_style=False): def Text(self, code): try: self.characters.append(bytes([self.data[self.offset+5],self.data[self.offset+4]]).decode("gbk")) + except IndexError: # short data, nothing to do + pass except UnicodeDecodeError: # HTL: When cut-and-paste on Linux, these transform to GB18030, # but I believe they are OCR artifacts. Where they occur, @@ -44,12 +46,24 @@ def TextMulti(self, code): if (code == 0x8001): self.characters.append("\n") while (1): - if (self.data[self.offset+1] == 0x80): - break - self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk")) + try: + if (self.data[self.offset+1] == 0x80): + break + except IndexError: # short data, nothing to do + return + try: + self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk")) + except UnicodeDecodeError: + self.characters.append("<0x%04X>\n" % (self.data[self.offset+3] * 256 + self.data[self.offset+2])) + except IndexError: # short data, nothing to do + return self.offset += 4 def Figure(self, code): + try: + self.data[self.offset+25] + except IndexError: # short data, nothing to do + return (ignore1, offset_x, offset_y, size_x, size_y, int2, int3, int4, int5)= struct.unpack(" page_data_offset) page_data = HNParsePage(output, page_style) + current_offset = page_data_offset + size_of_text_section + (found, images_per_page) = find_redundant_images(caj, current_offset, images_per_page) + if (found): + print("Page %d, skipping %d redundant images" % (i+1, images_per_page * ( images_per_page - 1))) + if (images_per_page > 1): if (len(page_data.figures) == images_per_page): - image_list.append(None) - image_list.append(page_data.figures) + if (page_data.figures[0][0] == 0) and (page_data.figures[0][1] == 0): + 
image_list.append(None) + image_list.append(page_data.figures) + else: + print("Page %d, Image Count %d, first image not at origin, expanding to %d pages" + % (i+1, len(page_data.figures), images_per_page)) else: - raise SystemExit("Image Count %d != %d" % (len(page_data.figures), images_per_page)) - current_offset = page_data_offset + size_of_text_section + print("Page %d, Image Count %d != %d" % (i+1, len(page_data.figures), images_per_page)) + if (len(page_data.figures) > images_per_page): + print("\tTruncating Page %d," % (i+1), page_data.figures) + image_list.append(None) + image_list.append(page_data.figures[0:images_per_page]) + else: + print("Page %d expanding to %d separate image pages" % (i+1, images_per_page)) + elif (images_per_page == 1): + if ((len(page_data.figures) == 0) or + ((len(page_data.figures) > 0) and + (not ((page_data.figures[0][0] == 0) and (page_data.figures[0][1] == 0))))): + print("Page %d possibly text-only + single figure(%d)" % (i+1, len(page_data.figures))) + else: + # don't care about images_per_page == 0 + pass for j in range(images_per_page): caj.seek(current_offset) read32 = caj.read(32) @@ -380,12 +423,31 @@ def _convert_hn(self, dest): 0 ) elif (image_type[image_type_enum] == "JPEG"): - (height, width) = struct.unpack(">HH", image_data[163:167]) + colorspace = Colorspace.RGB + component = 3 + # stock libjpeg location + (SOFn, frame_length, bits_per_pixel, height, width, component) = struct.unpack(">HHBHHB", image_data[158:168]) + if (SOFn != 0xFFC0): + # "Intel(R) JPEG Library" location + (SOFn, frame_length, bits_per_pixel, height, width, component) = struct.unpack(">HHBHHB", image_data[0x272:0x27c]) + if (SOFn != 0xFFC0): + # neither works, try brute-force + import imagesize + from PIL import Image as pilimage + with open(".tmp.jpg", "wb") as f: + f.write(image_data) + (width, height) = imagesize.get(".tmp.jpg") + pim = pilimage.open(".tmp.jpg") + if (pim.mode == 'L'): + component = 1 + os.remove(".tmp.jpg") if 
(image_type_enum == 1): # non-inverted JPEG Images height = -height + if (component == 1): + colorspace = Colorspace.L image_item = ( - Colorspace.RGB, + colorspace, (300, 300), ImageFormat.JPEG, image_data, @@ -418,11 +480,14 @@ def _text_extract_hn(self): [page_data_offset, size_of_text_section, images_per_page, page_no, unk2, next_page_data_offset] = struct.unpack("iihhii", caj.read(20)) caj.seek(page_data_offset) text_header_read32 = caj.read(32) - if (text_header_read32[8:20] == b'COMPRESSTEXT'): - [expanded_text_size] = struct.unpack("i", text_header_read32[20:24]) + if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')): + coff = 8 + if (text_header_read32[0:12] == b'COMPRESSTEXT'): + coff = 0 + [expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff]) import zlib - caj.seek(page_data_offset + 24) - data = caj.read(size_of_text_section - 24) + caj.seek(page_data_offset + 16 + coff) + data = caj.read(size_of_text_section - 16 - coff) output = zlib.decompress(data, bufsize=expanded_text_size) if (len(output) != expanded_text_size): raise SystemExit("Unexpected:", len(output), expanded_text_size) @@ -454,12 +519,15 @@ def _parse_hn(self): # The first 8 bytes are always: 03 80 XX 16 03 80 XX XX, # the last one 20 or 21, but the first two can be any. # 48/71 has: 03 80 E0 16 03 80 F7 20, the rest uniq - if (text_header_read32[8:20] == b'COMPRESSTEXT'): + if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')): + coff = 8 + if (text_header_read32[0:12] == b'COMPRESSTEXT'): + coff = 0 # expanded_text_size seems to be always about 2-3 times size_of_text_section, so this is a guess. 
- [expanded_text_size] = struct.unpack("i", text_header_read32[20:24]) + [expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff]) import zlib - caj.seek(page_data_offset + 24) - data = caj.read(size_of_text_section - 24) + caj.seek(page_data_offset + 16 + coff) + data = caj.read(size_of_text_section - 16 - coff) output = zlib.decompress(data, bufsize=expanded_text_size) if (len(output) != expanded_text_size): print("Unexpected:", len(output), expanded_text_size) diff --git a/printmodule/caj/jbig2dec.py b/printmodule/caj/jbig2dec.py index f344101..f213a3a 100644 --- a/printmodule/caj/jbig2dec.py +++ b/printmodule/caj/jbig2dec.py @@ -62,7 +62,7 @@ def DecodeJbig2(self): # PBM is only padded to 8 rather than 32. # If the padding is larger, write padded file. if (cimage.bytes_per_line > ((cimage.width +7) >> 3)): - cimage.width = bytes_per_line << 3 + cimage.width = cimage.bytes_per_line << 3 with open(sys.argv[2], "wb") as fout: fout.write("P4\n".encode("ascii")) diff --git a/printmodule/caj/pdfwutils.py b/printmodule/caj/pdfwutils.py index 34963fa..4115221 100644 --- a/printmodule/caj/pdfwutils.py +++ b/printmodule/caj/pdfwutils.py @@ -1078,7 +1078,10 @@ def add_multi_imagepage( image1[PdfName.Height] = -Im_i['imgheightpx'] else: image1[PdfName.Height] = Im_i['imgheightpx'] - image1[PdfName.ColorSpace] = PdfName.DeviceRGB + if Im_i['color'] == Colorspace.L: + image1[PdfName.ColorSpace] = PdfName.DeviceGray + else: + image1[PdfName.ColorSpace] = PdfName.DeviceRGB image1[PdfName.BitsPerComponent] = Im_i['depth'] offset_x = coordinates[i][0] / 300 * 72 / 2.473 diff --git a/printmodule/caj/utils.py b/printmodule/caj/utils.py index 3c14edf..aad7249 100644 --- a/printmodule/caj/utils.py +++ b/printmodule/caj/utils.py @@ -1,7 +1,12 @@ import os import sys -import PyPDF2.pdf as PDF -from PyPDF2 import PdfFileWriter, PdfFileReader +import struct +import PyPDF2.generic as PDF +try: + from PyPDF2 import PdfWriter, PdfReader +except ImportError: + from 
PyPDF2 import PdfFileWriter as PdfWriter + from PyPDF2 import PdfFileReader as PdfReader class Node(object): @@ -149,7 +154,10 @@ def fnd_unuse_no(nos1, nos2): def make_dest(pdfw, pg): d = PDF.ArrayObject() - d.append(pdfw.getPage(pg).indirectRef) + try: + d.append(pdfw.getPage(pg).indirect_ref) + except AttributeError: + d.append(pdfw.getPage(pg).indirectRef) d.append(PDF.NameObject("/XYZ")) d.append(PDF.NullObject()) d.append(PDF.NullObject()) @@ -179,11 +187,14 @@ def build_outlines_btree(toc): def add_outlines(toc, filename, output): build_outlines_btree(toc) - pdf_out = PdfFileWriter() + pdf_out = PdfWriter() inputFile = open(filename, 'rb') - pdf_in = PdfFileReader(inputFile) + pdf_in = PdfReader(inputFile) for p in pdf_in.pages: - pdf_out.addPage(p) + try: + pdf_out.add_page(p) + except AttributeError: + pdf_out.addPage(p) toc_num = len(toc) if (toc_num == 0): # Just copy if toc empty outputFile = open(output, "wb") @@ -217,9 +228,15 @@ def add_outlines(toc, filename, output): PDF.NameObject(v): idorefs[n.index] }) olitems.append(oli) - pdf_out._addObject(ol) + try: + pdf_out._add_object(ol) + except AttributeError: + pdf_out._addObject(ol) for i in olitems: - pdf_out._addObject(i) + try: + pdf_out._add_object(i) + except AttributeError: + pdf_out._addObject(i) pdf_out._root_object.update({ PDF.NameObject("/Outlines"): idorefs[0] }) @@ -227,3 +244,35 @@ def add_outlines(toc, filename, output): pdf_out.write(outputFile) inputFile.close() outputFile.close() + +# See if the page is N * N images, N images written N times, +# by checking image sizes and within 1 < N <= 10. +# Return True and N if that's the case. 
def find_redundant_images(caj, initial_offset, images_per_page):
    """Detect a page that stores N images N times over (N * N records).

    Walks the chain of image records starting at ``initial_offset``. Each
    record begins with three native ints: image type, offset to the image
    data, and size of the image data. The next record is read from
    ``offset_to_image_data + size_of_image_data``. The page is considered
    redundant when every record's data size equals the size seen ``N``
    records earlier, i.e. the sizes read [A, B, ... N] repeated N times.

    Only sizes are compared, not the image bytes themselves.

    Args:
        caj: Seekable binary file object. Its position is moved by this call.
        initial_offset: File offset of the first image record of the page.
        images_per_page: Image count declared for the page.

    Returns:
        ``(True, N)`` when ``images_per_page == N * N`` with ``1 < N <= 10``
        and the sizes repeat with period ``N``; otherwise
        ``(False, images_per_page)``. A record that cannot be read in full
        (truncated file) also yields ``(False, images_per_page)``.
    """
    record_size = 32   # bytes read per image record
    fields_size = 12   # three 4-byte ints at the start of the record

    # Perfect squares N * N for 1 < N <= 10, mapped back to N.
    square_roots = {n * n: n for n in range(2, 11)}
    stride = square_roots.get(images_per_page)
    if stride is None:
        return (False, images_per_page)

    sizes = []
    current_offset = initial_offset
    for j in range(images_per_page):
        caj.seek(current_offset)
        record = caj.read(record_size)
        if len(record) < fields_size:
            # short data, nothing to compare: treat as "not redundant"
            # instead of letting struct.unpack raise struct.error
            return (False, images_per_page)
        (_image_type_enum, offset_to_image_data, size_of_image_data) = \
            struct.unpack("iii", record[:fields_size])
        if (j >= stride) and (size_of_image_data != sizes[j - stride]):
            return (False, images_per_page)
        sizes.append(size_of_image_data)
        current_offset = offset_to_image_data + size_of_image_data
    # if we reach here, the image sizes seen are
    # [A, B, C ... N, ..., A, B, C ... N] exactly N times.
    return (True, stride)