-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpopulate_database.py
102 lines (85 loc) · 3.46 KB
/
populate_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from urllib import request
import ast
from lxml import html
from pymongo import MongoClient
# Module-level MongoDB client for the server at localhost, port 27017.
connection = MongoClient("mongodb://localhost:27017/")
# Handle to the "myFirstDatabase" database; the script body below reads and
# writes the `ingredients` and `urldata` collections through this name.
db = connection.myFirstDatabase
def notValidFood(tree):
    """Return the elements of *tree* that carry the 'labelnotavailable' class.

    The result is whatever ``tree.find_class`` returns; callers use it only
    for its truthiness (non-empty means the page has no label for the food).
    """
    unavailable_markers = tree.find_class('labelnotavailable')
    return unavailable_markers
def getIngredients(tree):
    """Parse the ingredient text of a label page into a list of dicts.

    The text of the first element with class 'labelingredientsvalue' is
    expected to look like ``"Water, Flour (Wheat, Barley), Salt"``. Each
    comma-separated item becomes ``{"name": ...}``; a parenthesised group
    becomes a ``"subingredients"`` list on the item that precedes it.
    Whitespace is not trimmed, so the example yields the name ``"Flour "``.

    Returns the parsed list, or the string "Ingredients not parsable" when
    the element is missing or the text does not follow that shape.
    """
    ingElement = tree.find_class('labelingredientsvalue')
    if not ingElement:
        return "Ingredients not parsable"
    ingString = ingElement[0].text_content()

    # The text is spliced into a Python literal below, so escape the two
    # characters that could end a string early or start an escape sequence.
    # Without this, a name such as 6" Tortilla made the whole list unparsable.
    ingString = ingString.replace("\\", "\\\\").replace("\"", "\\\"")

    # Placeholder written after every ")" and removed again together with the
    # stray quote that follows it. A NUL character is used instead of a
    # printable one so that ingredient text such as "Yum!" cannot be mistaken
    # for the placeholder.
    marker = "\x00"

    ingString = "[{\"name\": \"" + ingString + "\"}]"
    ingString = ingString.replace(")", "\"}]" + marker)
    ingString = ingString.replace(", ", "\"}, {\"name\": \"")
    ingString = ingString.replace("(", "\", \"subingredients\": [{\"name\": \"")
    ingString = ingString.replace(marker + "\"", "")
    try:
        return ast.literal_eval(ingString)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Text that does not fit the expected shape (e.g. unbalanced
        # parentheses) produces an invalid literal; report it the same way
        # as a missing element.
        return "Ingredients not parsable"
def getAllergens(tree):
    """Return the allergens listed on a label page as a list of strings.

    Reads the text of the first element with class 'labelallergensvalue'
    and splits it on ", ". Returns the string "Allergens not parseable"
    when the page has no such element.
    """
    allergens = tree.find_class('labelallergensvalue')
    # An explicit emptiness check replaces the former bare ``except:``, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    if not allergens:
        return "Allergens not parseable"
    return allergens[0].text_content().split(', ')
def getCalories(html):
    """Extract the calorie count from the raw source of a label page.

    Looks for the first occurrence of "Calories " and converts the text
    between it and the next "</b>" to an int.

    ``html`` may be ``str`` or ``bytes``. The script passes the undecoded
    result of ``page.read()``, which is bytes on Python 3; searching bytes
    for a str marker raised TypeError, so every lookup used to fall through
    to -1. (Inside this function the parameter shadows the imported lxml
    ``html`` module; the module is not needed here.)

    Returns the calorie count, or -1 when it cannot be found or parsed.
    """
    if isinstance(html, (bytes, bytearray)):
        # Both markers are plain ASCII, so undecodable bytes elsewhere in the
        # page can be replaced without affecting the search.
        html = html.decode("utf-8", errors="replace")
    prefix = "Calories "
    suffix = "</b>"
    try:
        start = html.index(prefix) + len(prefix)
        end = html.index(suffix, start)
        return int(html[start:end])
    except (ValueError, TypeError, AttributeError):
        # ValueError: marker missing or value not an integer.
        # TypeError/AttributeError: input was neither str nor bytes.
        return -1
# locationNum values tried, in order, for each recipe's label page.
_LOCATION_NUMS = ("09", "11")


def _fetch_label_page(location_num, index):
    """Download one label page and return its raw body.

    ``location_num`` fills the ``locationNum`` query parameter and ``index``
    (the zero-padded recipe number) fills ``RecNumAndPort``.
    """
    url = ("http://menus.tufts.edu/FoodPro%203.1.NET/label.aspx?locationNum="
           + location_num + "&RecNumAndPort=" + index)
    # The with-block closes the response even when read() raises.
    with request.urlopen(url) as page:
        return page.read()


def _load_food_page(index):
    """Return ``(tree, htmlSource)`` for the first location with a valid label.

    Returns ``None`` when every location reports the food as not available.
    """
    for location_num in _LOCATION_NUMS:
        htmlSource = _fetch_label_page(location_num, index)
        tree = html.fromstring(htmlSource)
        if not notValidFood(tree):
            return tree, htmlSource
    return None


if __name__ == "__main__":
    notParsable = []
    inserted = []
    numarr = []

    # validrecipes.txt: the first whitespace-separated token of every
    # non-blank line is a recipe number. Only that token is converted, since
    # it is the only one used.
    with open(os.path.expanduser("validrecipes.txt")) as f:
        for line in f:
            fields = line.split()
            if fields:  # skip blank lines
                numarr.append(int(fields[0]))

    # Brute force over all listed recipes, using the same base URL for each.
    for i in numarr:
        index = str(i).zfill(6)

        # Only the network step is expected to raise OSError (IOError is an
        # alias of it, and urllib's errors derive from it), so only that
        # step is guarded.
        try:
            loaded = _load_food_page(index)
        except OSError:
            print('error: ' + str(i))
            continue

        if loaded is None:
            print('not valid: ' + index)
            continue
        tree, htmlSource = loaded

        foodname = tree.find_class('labelrecipe')[0].text_content().strip(' ').lower()
        if db.ingredients.find_one({"name": foodname}) is not None:
            print('already in db: ' + index)
            continue

        toAddIng = {
            "ingredients": getIngredients(tree),
            "name": foodname,
            "allergens": getAllergens(tree),
            "calories": getCalories(htmlSource),
        }
        # Foods whose ingredients could not be parsed are still inserted;
        # they are additionally recorded in notParsable.
        if toAddIng["ingredients"] == "Ingredients not parsable":
            print('ingredients not parsable: ' + index)
            notParsable.append(i)
        print('inserting: ' + index)
        db.ingredients.insert_one(toAddIng)
        inserted.append(i)

    # Record which recipe numbers were inserted / unparsable in this run.
    db.urldata.insert_one({"notParsable": notParsable, "inserted": inserted})
# toAddNutrition = { "nutrition": getNutrition(tree), "name": foodname}
# ingdata collection has allergen information, ingredients does not