-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmp_parser.py
64 lines (57 loc) · 2.21 KB
/
xmp_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from collections import defaultdict
from xml.etree import ElementTree as ET
# Namespace URIs that are needed both on their own and in the prefix table.
_RDF_URI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
_XML_URI = 'http://www.w3.org/XML/1998/namespace'

# ElementTree spells qualified names as "{uri}local"; these are the "{uri}"
# parts, ready to be concatenated with a local name.
RDF_NS = '{%s}' % _RDF_URI
XML_NS = '{%s}' % _XML_URI

# Namespace URI -> short prefix. A URI that is not listed here has no short
# form in this table.
NS_MAP = {
    _RDF_URI: 'rdf',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://ns.adobe.com/xap/1.0/': 'xap',
    'http://ns.adobe.com/pdf/1.3/': 'pdf',
    'http://ns.adobe.com/xap/1.0/mm/': 'xapmm',
    'http://ns.adobe.com/pdfx/1.3/': 'pdfx',
    'http://prismstandard.org/namespaces/basic/2.0/': 'prism',
    'http://crossref.org/crossmark/1.0/': 'crossmark',
    'http://ns.adobe.com/xap/1.0/rights/': 'rights',
    _XML_URI: 'xml',
}
class XmpParser(object):
    """Parse an XMP packet and expose its RDF properties as plain Python data.

    Only properties written as child elements of the ``rdf:Description``
    elements directly under ``rdf:RDF`` are read.
    """

    def __init__(self, xmp):
        """Parse *xmp* and locate its ``rdf:RDF`` element.

        :param xmp: XML text of the XMP packet, handed to ``ElementTree.XML``.
        :raises xml.etree.ElementTree.ParseError: if *xmp* is not well-formed.
        """
        # NOTE(review): the standard library documents ElementTree as not
        # hardened against maliciously constructed XML -- confirm callers only
        # feed trusted packets, or pre-screen the input.
        self.tree = ET.XML(xmp)
        if self.tree.tag == RDF_NS + 'RDF':
            # Packet without an outer wrapper: the root itself is rdf:RDF,
            # which a child search on the root would never find.
            self.rdftree = self.tree
        else:
            # May be None when the document carries no rdf:RDF child.
            self.rdftree = self.tree.find(RDF_NS + 'RDF')

    @property
    def read_meta(self):
        """A dictionary of all the parsed metadata.

        The result maps a namespace key to a ``{tag: value}`` dictionary. The
        namespace key is the short prefix from ``NS_MAP`` when the namespace
        is known, the full namespace URI when it is not, and ``None`` for a
        tag without a namespace. When the same namespace/tag pair occurs more
        than once, the last occurrence wins. An empty dictionary is returned
        when the document has no ``rdf:RDF`` element.
        """
        if self.rdftree is None:
            return {}
        meta = defaultdict(dict)
        for desc in self.rdftree.findall(RDF_NS + 'Description'):
            for el in desc:
                ns, tag = self._parse_tag(el)
                meta[ns][tag] = self._parse_value(el)
        # Hand back a plain dict so lookups of missing namespaces raise
        # KeyError instead of silently creating entries.
        return dict(meta)

    def _parse_tag(self, el):
        """Extract the namespace and tag from an element.

        :param el: element whose ``tag`` is either ``"{uri}local"`` or a bare
            local name.
        :returns: ``(namespace, local_name)``; the namespace is shortened via
            ``NS_MAP`` when possible and is ``None`` for a bare local name.
        """
        ns = None
        tag = el.tag
        if tag[0] == "{":
            ns, tag = tag[1:].split('}', 1)
            # Fall back to the full URI for namespaces without a known prefix.
            ns = NS_MAP.get(ns, ns)
        return ns, tag

    def _parse_value(self, el):
        """Extract the metadata value from an element.

        :param el: a property element.
        :returns: a list of item texts for an ``rdf:Bag`` or ``rdf:Seq``
            child, a ``{xml:lang: text}`` dictionary for an ``rdf:Alt`` child,
            otherwise the element's own text (which may be ``None``).
        """
        # Containers are probed in a fixed order: Bag, then Seq, then Alt.
        for container in ('Bag', 'Seq'):
            if el.find(RDF_NS + container) is not None:
                items = el.findall(RDF_NS + container + '/' + RDF_NS + 'li')
                return [li.text for li in items]
        if el.find(RDF_NS + 'Alt') is not None:
            items = el.findall(RDF_NS + 'Alt/' + RDF_NS + 'li')
            # Items without an xml:lang attribute are stored under None.
            return dict((li.get(XML_NS + 'lang'), li.text) for li in items)
        return el.text