Skip to content

Commit

Permalink
update caj2pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
qinlili23333 committed Dec 20, 2022
1 parent 68352db commit dfbcb7c
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 34 deletions.
20 changes: 17 additions & 3 deletions printmodule/caj/HNParsePage.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def __init__(self, data, old_style=False):
def Text(self, code):
try:
self.characters.append(bytes([self.data[self.offset+5],self.data[self.offset+4]]).decode("gbk"))
except IndexError: # short data, nothing to do
pass
except UnicodeDecodeError:
# HTL: When cut-and-paste on Linux, these transform to GB18030,
# but I believe they are OCR artifacts. Where they occur,
Expand Down Expand Up @@ -44,12 +46,24 @@ def TextMulti(self, code):
if (code == 0x8001):
self.characters.append("\n")
while (1):
if (self.data[self.offset+1] == 0x80):
break
self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk"))
try:
if (self.data[self.offset+1] == 0x80):
break
except IndexError: # short data, nothing to do
return
try:
self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk"))
except UnicodeDecodeError:
self.characters.append("<0x%04X>\n" % (self.data[self.offset+3] * 256 + self.data[self.offset+2]))
except IndexError: # short data, nothing to do
return
self.offset += 4

def Figure(self, code):
try:
self.data[self.offset+25]
except IndexError: # short data, nothing to do
return
(ignore1, offset_x, offset_y, size_x, size_y, int2, int3, int4, int5)= struct.unpack("<HHHHHIIII", self.data[self.offset:self.offset+26])
# in units of 1/2.473 pixels
self.figures.append([offset_x, offset_y, size_x, size_y])
Expand Down
Binary file not shown.
Binary file added printmodule/caj/__pycache__/utils.cpython-310.pyc
Binary file not shown.
110 changes: 89 additions & 21 deletions printmodule/caj/cajparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@
import struct
from shutil import copy
from subprocess import check_output, STDOUT, CalledProcessError
from utils import fnd, fnd_all, add_outlines, fnd_rvrs, fnd_unuse_no
from utils import fnd, fnd_all, add_outlines, fnd_rvrs, fnd_unuse_no, find_redundant_images

try:
from PyPDF2 import errors
except ImportError:
from PyPDF2 import utils as errors

KDH_PASSPHRASE = b"FZHMEI"

Expand Down Expand Up @@ -287,10 +292,23 @@ def _convert_caj(self, dest):
check_output(["mutool", "clean", "pdf.tmp", "pdf_toc.pdf"], stderr=STDOUT)
except CalledProcessError as e:
print(e.output.decode("utf-8"))
raise SystemExit("Command mutool returned non-zero exit status " + str(e.returncode))
print("Command mutool returned non-zero exit status " + str(e.returncode))
print("Try merge mode...")
os.remove("pdf_toc.pdf")
try:
check_output(["mutool", "merge", "-opdf_toc.pdf", "pdf.tmp"], stderr=STDOUT)
except CalledProcessError as e:
print(e.output.decode("utf-8"))
SystemExit("Merge mode also failed.")


# Add Outlines
add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
try:
add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
except errors.PdfReadError as e:
print("errors.PdfReadError:", str(e))
copy("pdf_toc.pdf", dest)
pass
os.remove("pdf.tmp")
os.remove("pdf_toc.pdf")

Expand All @@ -306,11 +324,14 @@ def _convert_hn(self, dest):
[page_data_offset, size_of_text_section, images_per_page, page_no, unk2, next_page_data_offset] = struct.unpack("iihhii", caj.read(20))
caj.seek(page_data_offset)
text_header_read32 = caj.read(32)
if (text_header_read32[8:20] == b'COMPRESSTEXT'):
[expanded_text_size] = struct.unpack("i", text_header_read32[20:24])
if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')):
coff = 8
if (text_header_read32[0:12] == b'COMPRESSTEXT'):
coff = 0
[expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff])
import zlib
caj.seek(page_data_offset + 24)
data = caj.read(size_of_text_section - 24)
caj.seek(page_data_offset + 16 + coff)
data = caj.read(size_of_text_section - 16 - coff)
output = zlib.decompress(data, bufsize=expanded_text_size)
if (len(output) != expanded_text_size):
raise SystemExit("Unexpected:", len(output), expanded_text_size)
Expand All @@ -321,13 +342,35 @@ def _convert_hn(self, dest):
page_style = (next_page_data_offset > page_data_offset)
page_data = HNParsePage(output, page_style)

current_offset = page_data_offset + size_of_text_section
(found, images_per_page) = find_redundant_images(caj, current_offset, images_per_page)
if (found):
print("Page %d, skipping %d redundant images" % (i+1, images_per_page * ( images_per_page - 1)))

if (images_per_page > 1):
if (len(page_data.figures) == images_per_page):
image_list.append(None)
image_list.append(page_data.figures)
if (page_data.figures[0][0] == 0) and (page_data.figures[0][1] == 0):
image_list.append(None)
image_list.append(page_data.figures)
else:
print("Page %d, Image Count %d, first image not at origin, expanding to %d pages"
% (i+1, len(page_data.figures), images_per_page))
else:
raise SystemExit("Image Count %d != %d" % (len(page_data.figures), images_per_page))
current_offset = page_data_offset + size_of_text_section
print("Page %d, Image Count %d != %d" % (i+1, len(page_data.figures), images_per_page))
if (len(page_data.figures) > images_per_page):
print("\tTruncating Page %d," % (i+1), page_data.figures)
image_list.append(None)
image_list.append(page_data.figures[0:images_per_page])
else:
print("Page %d expanding to %d separate image pages" % (i+1, images_per_page))
elif (images_per_page == 1):
if ((len(page_data.figures) == 0) or
((len(page_data.figures) > 0) and
(not ((page_data.figures[0][0] == 0) and (page_data.figures[0][1] == 0))))):
print("Page %d possibly text-only + single figure(%d)" % (i+1, len(page_data.figures)))
else:
# don't care about images_per_page == 0
pass
for j in range(images_per_page):
caj.seek(current_offset)
read32 = caj.read(32)
Expand Down Expand Up @@ -380,12 +423,31 @@ def _convert_hn(self, dest):
0
)
elif (image_type[image_type_enum] == "JPEG"):
(height, width) = struct.unpack(">HH", image_data[163:167])
colorspace = Colorspace.RGB
component = 3
# stock libjpeg location
(SOFn, frame_length, bits_per_pixel, height, width, component) = struct.unpack(">HHBHHB", image_data[158:168])
if (SOFn != 0xFFC0):
# "Intel(R) JPEG Library" location
(SOFn, frame_length, bits_per_pixel, height, width, component) = struct.unpack(">HHBHHB", image_data[0x272:0x27c])
if (SOFn != 0xFFC0):
# neither works, try brute-force
import imagesize
from PIL import Image as pilimage
with open(".tmp.jpg", "wb") as f:
f.write(image_data)
(width, height) = imagesize.get(".tmp.jpg")
pim = pilimage.open(".tmp.jpg")
if (pim.mode == 'L'):
component = 1
os.remove(".tmp.jpg")
if (image_type_enum == 1):
# non-inverted JPEG Images
height = -height
if (component == 1):
colorspace = Colorspace.L
image_item = (
Colorspace.RGB,
colorspace,
(300, 300),
ImageFormat.JPEG,
image_data,
Expand Down Expand Up @@ -418,11 +480,14 @@ def _text_extract_hn(self):
[page_data_offset, size_of_text_section, images_per_page, page_no, unk2, next_page_data_offset] = struct.unpack("iihhii", caj.read(20))
caj.seek(page_data_offset)
text_header_read32 = caj.read(32)
if (text_header_read32[8:20] == b'COMPRESSTEXT'):
[expanded_text_size] = struct.unpack("i", text_header_read32[20:24])
if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')):
coff = 8
if (text_header_read32[0:12] == b'COMPRESSTEXT'):
coff = 0
[expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff])
import zlib
caj.seek(page_data_offset + 24)
data = caj.read(size_of_text_section - 24)
caj.seek(page_data_offset + 16 + coff)
data = caj.read(size_of_text_section - 16 - coff)
output = zlib.decompress(data, bufsize=expanded_text_size)
if (len(output) != expanded_text_size):
raise SystemExit("Unexpected:", len(output), expanded_text_size)
Expand Down Expand Up @@ -454,12 +519,15 @@ def _parse_hn(self):
# The first 8 bytes are always: 03 80 XX 16 03 80 XX XX,
# the last one 20 or 21, but the first two can be any.
# 48/71 has: 03 80 E0 16 03 80 F7 20, the rest uniq
if (text_header_read32[8:20] == b'COMPRESSTEXT'):
if ((text_header_read32[8:20] == b'COMPRESSTEXT') or (text_header_read32[0:12] == b'COMPRESSTEXT')):
coff = 8
if (text_header_read32[0:12] == b'COMPRESSTEXT'):
coff = 0
# expanded_text_size seems to be always about 2-3 times size_of_text_section, so this is a guess.
[expanded_text_size] = struct.unpack("i", text_header_read32[20:24])
[expanded_text_size] = struct.unpack("i", text_header_read32[12+coff:16+coff])
import zlib
caj.seek(page_data_offset + 24)
data = caj.read(size_of_text_section - 24)
caj.seek(page_data_offset + 16 + coff)
data = caj.read(size_of_text_section - 16 - coff)
output = zlib.decompress(data, bufsize=expanded_text_size)
if (len(output) != expanded_text_size):
print("Unexpected:", len(output), expanded_text_size)
Expand Down
2 changes: 1 addition & 1 deletion printmodule/caj/jbig2dec.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def DecodeJbig2(self):
# PBM is only padded to 8 rather than 32.
# If the padding is larger, write padded file.
if (cimage.bytes_per_line > ((cimage.width +7) >> 3)):
cimage.width = bytes_per_line << 3
cimage.width = cimage.bytes_per_line << 3

with open(sys.argv[2], "wb") as fout:
fout.write("P4\n".encode("ascii"))
Expand Down
5 changes: 4 additions & 1 deletion printmodule/caj/pdfwutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,7 +1078,10 @@ def add_multi_imagepage(
image1[PdfName.Height] = -Im_i['imgheightpx']
else:
image1[PdfName.Height] = Im_i['imgheightpx']
image1[PdfName.ColorSpace] = PdfName.DeviceRGB
if Im_i['color'] == Colorspace.L:
image1[PdfName.ColorSpace] = PdfName.DeviceGray
else:
image1[PdfName.ColorSpace] = PdfName.DeviceRGB
image1[PdfName.BitsPerComponent] = Im_i['depth']

offset_x = coordinates[i][0] / 300 * 72 / 2.473
Expand Down
65 changes: 57 additions & 8 deletions printmodule/caj/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import os
import sys
import PyPDF2.pdf as PDF
from PyPDF2 import PdfFileWriter, PdfFileReader
import struct
import PyPDF2.generic as PDF
try:
from PyPDF2 import PdfWriter, PdfReader
except ImportError:
from PyPDF2 import PdfFileWriter as PdfWriter
from PyPDF2 import PdfFileReader as PdfReader


class Node(object):
Expand Down Expand Up @@ -149,7 +154,10 @@ def fnd_unuse_no(nos1, nos2):

def make_dest(pdfw, pg):
d = PDF.ArrayObject()
d.append(pdfw.getPage(pg).indirectRef)
try:
d.append(pdfw.getPage(pg).indirect_ref)
except AttributeError:
d.append(pdfw.getPage(pg).indirectRef)
d.append(PDF.NameObject("/XYZ"))
d.append(PDF.NullObject())
d.append(PDF.NullObject())
Expand Down Expand Up @@ -179,11 +187,14 @@ def build_outlines_btree(toc):

def add_outlines(toc, filename, output):
build_outlines_btree(toc)
pdf_out = PdfFileWriter()
pdf_out = PdfWriter()
inputFile = open(filename, 'rb')
pdf_in = PdfFileReader(inputFile)
pdf_in = PdfReader(inputFile)
for p in pdf_in.pages:
pdf_out.addPage(p)
try:
pdf_out.add_page(p)
except AttributeError:
pdf_out.addPage(p)
toc_num = len(toc)
if (toc_num == 0): # Just copy if toc empty
outputFile = open(output, "wb")
Expand Down Expand Up @@ -217,13 +228,51 @@ def add_outlines(toc, filename, output):
PDF.NameObject(v): idorefs[n.index]
})
olitems.append(oli)
pdf_out._addObject(ol)
try:
pdf_out._add_object(ol)
except AttributeError:
pdf_out._addObject(ol)
for i in olitems:
pdf_out._addObject(i)
try:
pdf_out._add_object(i)
except AttributeError:
pdf_out._addObject(i)
pdf_out._root_object.update({
PDF.NameObject("/Outlines"): idorefs[0]
})
outputFile = open(output, "wb")
pdf_out.write(outputFile)
inputFile.close()
outputFile.close()

# See if the page is N * N images, N images written N times,
# by checking image sizes and within 1 < N <= 10.
# Return True and N if that's the case.
def find_redundant_images(caj, initial_offset, images_per_page):
    """Detect a page that stores N images N times over (N * N records).

    Walks the chain of image records starting at ``initial_offset`` and
    compares the image-data sizes: the layout counts as redundant when the
    size sequence repeats with period N, where ``images_per_page == N * N``
    and ``1 < N <= 10``.

    Args:
        caj: Seekable binary file object positioned anywhere; this function
            seeks it and leaves the position after the last record read.
        initial_offset: File offset of the first image record.
        images_per_page: Number of image records declared for the page.

    Returns:
        ``(True, N)`` when the redundant layout is detected, otherwise
        ``(False, images_per_page)`` with the count unchanged.
    """
    # Perfect squares 4..100 mapped to their roots, i.e. 1 < N <= 10.
    strides = {n * n: n for n in range(2, 11)}
    stride = strides.get(images_per_page)
    if stride is None:
        return (False, images_per_page)

    sizes = []
    current_offset = initial_offset
    for j in range(images_per_page):
        caj.seek(current_offset)
        header = caj.read(32)
        if len(header) < 12:
            # Truncated record: this is only a heuristic probe, so report
            # "not redundant" instead of failing with struct.error here.
            return (False, images_per_page)
        # Only the first three native ints of the record are needed; the
        # leading image-type field is irrelevant for the size comparison.
        _image_type, offset_to_image_data, size_of_image_data = struct.unpack_from("iii", header)
        if j >= stride and size_of_image_data != sizes[j - stride]:
            return (False, images_per_page)
        sizes.append(size_of_image_data)
        # The next record starts right after this record's image data.
        current_offset = offset_to_image_data + size_of_image_data
    # Sizes seen are [A, B, ... N] repeated exactly N times.
    return (True, stride)

0 comments on commit dfbcb7c

Please sign in to comment.