-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathprocess_bounding_boxes.py
executable file
·254 lines (204 loc) · 8.72 KB
/
process_bounding_boxes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Process the ImageNet Challenge bounding boxes for TensorFlow model training.
This script is called as
process_bounding_boxes.py <dir> [synsets-file]
Where <dir> is a directory containing the downloaded and unpacked bounding box
data. If [synsets-file] is supplied, then only the bounding boxes whose
synstes are contained within this file are returned. Note that the
[synsets-file] file contains synset ids, one per line.
The script dumps out a CSV text file in which each line contains an entry.
n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
The entry can be read as:
<JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
The bounding box for <JPEG file name> contains two points (xmin, ymin) and
(xmax, ymax) specifying the lower-left corner and upper-right corner of a
bounding box in *relative* coordinates.
The user supplies a directory where the XML files reside. The directory
structure in the directory <dir> is assumed to look like this:
<dir>/nXXXXXXXX/nXXXXXXXX_YYYY.xml
Each XML file contains a bounding box annotation. The script:
(1) Parses the XML file and extracts the filename, label and bounding box info.
(2) The bounding box is specified in the XML files as integer (xmin, ymin) and
(xmax, ymax) *relative* to image size displayed to the human annotator. The
size of the image displayed to the human annotator is stored in the XML file
as integer (height, width).
Note that the displayed size will differ from the actual size of the image
downloaded from image-net.org. To make the bounding box annotation useable,
we convert bounding box to floating point numbers relative to displayed
height and width of the image.
Note that each XML file might contain N bounding box annotations.
Note that the points are all clamped at a range of [0.0, 1.0] because some
human annotations extend outside the range of the supplied image.
See details here: http://image-net.org/download-bboxes
(3) By default, the script outputs all valid bounding boxes. If a
[synsets-file] is supplied, only the subset of bounding boxes associated
with those synsets are outputted. Importantly, one can supply a list of
synsets in the ImageNet Challenge and output the list of bounding boxes
associated with the training images of the ILSVRC.
We use these bounding boxes to inform the random distortion of images
supplied to the network.
If you run this script successfully, you will see the following output
to stderr:
> Finished processing 544546 XML files.
> Skipped 0 XML files not in ImageNet Challenge.
> Skipped 0 bounding boxes not in ImageNet Challenge.
> Wrote 615299 bounding boxes from 544546 annotated images.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import os.path
import sys
import xml.etree.ElementTree as ET
class BoundingBox(object):
pass
def GetItem(name, root, index=0):
count = 0
for item in root.iter(name):
if count == index:
return item.text
count += 1
# Failed to find "index" occurrence of item.
return -1
def GetInt(name, root, index=0):
# In some XML annotation files, the point values are not integers, but floats.
# So we add a float function to avoid ValueError.
return int(float(GetItem(name, root, index)))
def FindNumberBoundingBoxes(root):
index = 0
while True:
if GetInt('xmin', root, index) == -1:
break
index += 1
return index
def ProcessXMLAnnotation(xml_file):
"""Process a single XML file containing a bounding box."""
# pylint: disable=broad-except
try:
tree = ET.parse(xml_file)
except Exception:
print('Failed to parse: ' + xml_file, file=sys.stderr)
return None
# pylint: enable=broad-except
root = tree.getroot()
num_boxes = FindNumberBoundingBoxes(root)
boxes = []
for index in range(num_boxes):
box = BoundingBox()
# Grab the 'index' annotation.
box.xmin = GetInt('xmin', root, index)
box.ymin = GetInt('ymin', root, index)
box.xmax = GetInt('xmax', root, index)
box.ymax = GetInt('ymax', root, index)
box.width = GetInt('width', root)
box.height = GetInt('height', root)
box.filename = GetItem('filename', root) + '.JPEG'
box.label = GetItem('name', root)
xmin = float(box.xmin) / float(box.width)
xmax = float(box.xmax) / float(box.width)
ymin = float(box.ymin) / float(box.height)
ymax = float(box.ymax) / float(box.height)
# Some images contain bounding box annotations that
# extend outside of the supplied image. See, e.g.
# n03127925/n03127925_147.xml
# Additionally, for some bounding boxes, the min > max
# or the box is entirely outside of the image.
min_x = min(xmin, xmax)
max_x = max(xmin, xmax)
box.xmin_scaled = min(max(min_x, 0.0), 1.0)
box.xmax_scaled = min(max(max_x, 0.0), 1.0)
min_y = min(ymin, ymax)
max_y = max(ymin, ymax)
box.ymin_scaled = min(max(min_y, 0.0), 1.0)
box.ymax_scaled = min(max(max_y, 0.0), 1.0)
boxes.append(box)
return boxes
if __name__ == '__main__':
if len(sys.argv) < 2 or len(sys.argv) > 3:
print('Invalid usage\n'
'usage: process_bounding_boxes.py <dir> [synsets-file]',
file=sys.stderr)
sys.exit(-1)
xml_files = glob.glob(sys.argv[1] + '/*/*.xml')
print('Identified %d XML files in %s' % (len(xml_files), sys.argv[1]),
file=sys.stderr)
if len(sys.argv) == 3:
labels = set([l.strip() for l in open(sys.argv[2]).readlines()])
print('Identified %d synset IDs in %s' % (len(labels), sys.argv[2]),
file=sys.stderr)
else:
labels = None
skipped_boxes = 0
skipped_files = 0
saved_boxes = 0
saved_files = 0
for file_index, one_file in enumerate(xml_files):
# Example: <...>/n06470073/n00141669_6790.xml
label = os.path.basename(os.path.dirname(one_file))
# Determine if the annotation is from an ImageNet Challenge label.
if labels is not None and label not in labels:
skipped_files += 1
continue
bboxes = ProcessXMLAnnotation(one_file)
assert bboxes is not None, 'No bounding boxes found in ' + one_file
found_box = False
for bbox in bboxes:
if labels is not None:
if bbox.label != label:
# Note: There is a slight bug in the bounding box annotation data.
# Many of the dog labels have the human label 'Scottish_deerhound'
# instead of the synset ID 'n02092002' in the bbox.label field. As a
# simple hack to overcome this issue, we only exclude bbox labels
# *which are synset ID's* that do not match original synset label for
# the XML file.
if bbox.label in labels:
skipped_boxes += 1
continue
# Guard against improperly specified boxes.
if (bbox.xmin_scaled >= bbox.xmax_scaled or
bbox.ymin_scaled >= bbox.ymax_scaled):
skipped_boxes += 1
continue
# Note bbox.filename occasionally contains '%s' in the name. This is
# data set noise that is fixed by just using the basename of the XML file.
image_filename = os.path.splitext(os.path.basename(one_file))[0]
print('%s.JPEG,%.4f,%.4f,%.4f,%.4f' %
(image_filename,
bbox.xmin_scaled, bbox.ymin_scaled,
bbox.xmax_scaled, bbox.ymax_scaled))
saved_boxes += 1
found_box = True
if found_box:
saved_files += 1
else:
skipped_files += 1
if not file_index % 5000:
print('--> processed %d of %d XML files.' %
(file_index + 1, len(xml_files)),
file=sys.stderr)
print('--> skipped %d boxes and %d XML files.' %
(skipped_boxes, skipped_files), file=sys.stderr)
print('Finished processing %d XML files.' % len(xml_files), file=sys.stderr)
print('Skipped %d XML files not in ImageNet Challenge.' % skipped_files,
file=sys.stderr)
print('Skipped %d bounding boxes not in ImageNet Challenge.' % skipped_boxes,
file=sys.stderr)
print('Wrote %d bounding boxes from %d annotated images.' %
(saved_boxes, saved_files),
file=sys.stderr)
print('Finished.', file=sys.stderr)