-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodules.py
385 lines (340 loc) · 14.9 KB
/
modules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
import re
import re
import pandas as pd
from datetime import datetime
#import nlu_module as mod
import pandas as pd
#from nlu_module import *
def is_search_query(query):
    """Return True when *query* contains one of the listing/search phrases.

    The comparison is a case-insensitive substring test, so a phrase is
    also detected when it appears inside a longer word.
    """
    search_keywords = [
        "show all", "show me", "tell me", "tell all", "list", "list all", "options",
        "information on"
    ]
    lowered = query.lower()
    return any(phrase in lowered for phrase in search_keywords)
# Function to filter the flights table using the attributes extracted from a query
def filter_flights(flights_df, features):
    """Return the flights that satisfy every filter present in *features*.

    Args:
        flights_df: DataFrame with at least the columns ``date``,
            ``num_code``, ``dep_time``, ``arr_time``, ``from``, ``to``,
            ``airline`` and ``price``. It is not modified.
        features: Mapping of filter values. ``date``, ``airline``, ``from``
            and ``to`` are matched by equality when truthy; ``price_min`` and
            ``price_max`` are inclusive bounds applied when not ``None``.

    Returns:
        A new DataFrame restricted to the eight columns listed above, with
        ``price`` converted to numbers (unparseable prices become NaN). When
        no filter is set every row is returned. If filtering fails (for
        example a required column is missing) the error is printed and an
        empty DataFrame is returned.
    """
    columns_of_interest = ['date', 'num_code', 'dep_time', 'arr_time', 'from', 'to', 'airline', 'price']

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original ``price`` column.
    flights = flights_df.assign(price=pd.to_numeric(flights_df['price'], errors='coerce'))

    try:
        # Boolean masks instead of a string-built query: values containing
        # quotes or operators are compared literally, never evaluated.
        mask = pd.Series(True, index=flights.index)

        for column in ('date', 'airline', 'from', 'to'):
            wanted = features.get(column)
            if wanted:
                mask &= flights[column] == wanted

        price_min = features.get('price_min')
        if price_min is not None:
            mask &= flights['price'] >= float(price_min)

        price_max = features.get('price_max')
        if price_max is not None:
            mask &= flights['price'] <= float(price_max)

        return flights.loc[mask, columns_of_interest]
    except Exception as e:
        # Best-effort boundary kept from the original: report and fall back
        # to an empty result rather than propagating to the caller.
        print(f"Error applying query: {e}")
        return pd.DataFrame()
# Regular-expression fragments that is_count_query searches for in a
# normalised, lower-cased query.
count_patterns = [
    r"how many flights",
    r"number of flights",
    r"total flights",
    r"can you tell me the number",
    r"number of planes",
    r"how many options",
    r"how many departures",
    r"tell me the available flights",
    r"available flights",
    r"total flights count",
    r"count of flights",
    r"flight count",
]
# Maps each canonical term (key) to the alternative spellings or phrasings
# (values) that replace_synonyms / normalize_query rewrite to that term.
synonym_map = {
    # Vocabulary for the things being asked about.
    'flights': [
        'aeroplanes',
        'aircrafts',
        'airflights',
        'planes',
        'airliners',
        'jetliners',
        'flight options',
        'flight schedules',
    ],
    'options': ['choices', 'alternatives', 'selections', 'possibilities', 'varieties'],
    'departures': ['departures', 'leavings', 'outgoings'],
    # Origin / destination markers.
    'from': ['departure from', 'starting point', 'origin', 'departing from'],
    'to': [
        'destination',
        'arriving at',
        'going to',
        'traveling to',
        'destination is',
        'destination at',
    ],
    # City names and their alternative spellings.
    'Bangalore': ['Banglore', 'Bengaluru', 'Bangaluru', 'Bengalore', 'Bngalore'],
    'Hyderabad': ['Hydrebad', 'Hydrabad', 'Hyderbad', 'Hderabad'],
    'Delhi': ['New Delhi', 'Dehli', 'Dilli', 'Dillhi', 'Delhii'],
    'Chennai': ['Chenai', 'Madras', 'Channai'],
    'Kolkata': ['Calcut', 'Calcutta', 'Kolkota', 'Kolkatha'],
    'Mumbai': ['Bombay', 'Bombai', 'Mombai', 'Mumbay', 'Mumbaai'],
    # Price comparison phrases.
    'under': ['below', 'less than', 'under the limit', 'at most'],
    'equal to': ['equivalent to', 'exactly'],
    'above': ['at least', 'higher than', 'more than', 'greater than'],
}
def replace_synonyms(query, synonym_map):
    """Rewrite every synonym found in *query* to its canonical term.

    Args:
        query: The user text, or a dict whose ``'query'`` entry holds it
            (a dict without that key is treated as an empty string).
        synonym_map: Mapping of canonical term -> list of synonyms.

    Returns:
        The query with whole-word, case-insensitive synonym matches replaced
        by the canonical term. Text that is not a synonym keeps its case.
    """
    if isinstance(query, dict):
        query = query.get('query', '')

    for internal_term, synonyms in synonym_map.items():
        # Longest synonym first: otherwise a short synonym such as
        # "destination" rewrites the text before "destination is" is tried,
        # leaving a stray "is" behind.
        for synonym in sorted(synonyms, key=len, reverse=True):
            query = re.sub(
                rf'\b{re.escape(synonym)}\b',
                # A callable inserts the term literally; a plain string would
                # be interpreted as a replacement template.
                lambda _match, term=internal_term: term,
                query,
                flags=re.IGNORECASE,
            )
    return query
def match_internal_patterns(query, internal_patterns):
    """Match *query* against per-attribute regex lists.

    Args:
        query: Text to search.
        internal_patterns: Mapping of attribute name -> iterable of regex
            patterns, each searched case-insensitively.

    Returns:
        A dict pre-filled with ``None`` for ``from``, ``to``, ``date``,
        ``airline``, ``price_min`` and ``price_max``. For ``from``/``to`` a
        match stores the stripped text of capture group 1; for any other
        attribute a match stores ``True``. When several patterns of one
        attribute match, the last one wins.
    """
    city_attributes = ('from', 'to')
    found = dict.fromkeys(
        ('from', 'to', 'date', 'airline', 'price_min', 'price_max')
    )

    for name, regexes in internal_patterns.items():
        for regex in regexes:
            hit = re.search(regex, query, re.IGNORECASE)
            if not hit:
                continue
            # Cities carry a value; every other attribute is only flagged.
            found[name] = hit.group(1).strip() if name in city_attributes else True

    return found
def _pick_city(keyword, text, valid_cities):
    """Return the best city candidate following *keyword* in *text*, or None."""
    candidates = [
        found.group(1).capitalize()
        for found in re.finditer(rf'(?i)\b{keyword}\s+(\w+)\b', text)
    ]
    # A known city anywhere in the text beats an earlier filler word
    # ("want to fly ... to Delhi").
    for candidate in candidates:
        if candidate in valid_cities:
            return candidate
    # Otherwise keep the first word as before, but never a bare number
    # ("equal to 5000" is a price, not a destination).
    for candidate in candidates:
        if not candidate.isdigit():
            return candidate
    return None


def extract_attributes(user_query):
    """Extract flight-search filters from a free-text query.

    The query is first passed through ``replace_synonyms`` so that
    alternative spellings and phrasings are reduced to canonical terms.

    Args:
        user_query: The user text (or a dict accepted by
            ``replace_synonyms``).

    Returns:
        A dict with the keys ``date`` (as produced by ``normalize_date`` or
        ``None``), ``from`` and ``to`` (capitalised word or ``None``),
        ``airline`` (``'Vistara'``, ``'Air India'`` or ``None``),
        ``price_min`` (defaults to ``0``) and ``price_max`` (defaults to
        ``float('inf')``). Both price bounds are treated as inclusive by
        ``filter_flights``.
    """
    user_query = replace_synonyms(user_query, synonym_map)

    attributes = {
        'date': None,
        'from': None,
        'to': None,
        'airline': None,
        'price_min': None,
        'price_max': None
    }

    valid_cities = ['Chennai', 'Mumbai', 'Hyderabad', 'Delhi', 'Kolkata', 'Bangalore']

    # Shared fragments keep the month list identical in every date pattern
    # (the last pattern previously lacked December).
    month = (
        r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
        r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|'
        r'Dec(?:ember)?)'
    )
    ordinal = r'(?:st|nd|rd|th)?'
    date_patterns = [
        # "12th Feb 2022" / "12 February" (the ordinal suffix is optional,
        # so this also covers the plain "12 Feb" form).
        rf'(?i)\b(\d{{1,2}}{ordinal} {month}(?: \d{{4}})?)\b',
        # "12/02/2022" or "12-02-22"
        r'(?i)\b(\d{1,2}[-/](?:0[1-9]|1[0-2])[-/](?:\d{2}|\d{4}))\b',
        # "February 12"
        rf'(?i)\b{month} \d{{1,2}}\b',
        # "12th of February 2022"
        rf'(?i)\b(\d{{1,2}}{ordinal} of {month}(?: \d{{4}})?)\b',
        # "February 12th"
        rf'(?i)\b{month} \d{{1,2}}{ordinal}\b',
    ]

    # Extract date: the first pattern whose match normalises successfully wins.
    for pattern in date_patterns:
        date_match = re.search(pattern, user_query)
        if date_match:
            normalized_date = normalize_date(date_match.group(0))
            if normalized_date:
                attributes['date'] = normalized_date
                break

    # Extract origin and destination.
    attributes['from'] = _pick_city('from', user_query, valid_cities)
    attributes['to'] = _pick_city('to', user_query, valid_cities)

    # Extract airline (if several are mentioned the last listed one wins).
    known_airlines = ['Vistara', 'Air India']  # Add more airlines as needed
    lowered_query = user_query.lower()
    for airline in known_airlines:
        if airline.lower() in lowered_query:
            attributes['airline'] = airline

    # Extract price conditions. 'under' and 'above' are the canonical terms
    # left behind by synonym replacement; the raw phrasings are kept so the
    # parsing does not depend on the exact contents of synonym_map.
    upper_terms = ('under', 'below', 'less than')
    lower_terms = ('above', 'greater than', 'more than', 'at least')

    def bound_after(terms):
        # "<term> [or equal to] <number>" -> the number, else None.
        alternatives = '|'.join(
            rf'{term}(?:\s+or\s+equal\s+to)?' for term in terms
        )
        found = re.search(
            rf'\b(?:{alternatives})\s*(\d+)', user_query, re.IGNORECASE
        )
        return int(found.group(1)) if found else None

    # Each bound is set independently so "above 1000 and under 5000" keeps
    # both limits.
    price_max = bound_after(upper_terms)
    price_min = bound_after(lower_terms)

    # A bare "equal to N" pins both bounds; "... or equal to N" belongs to
    # the preceding comparison and is excluded by the look-behind.
    exact = re.search(r'(?<!\bor\s)\bequal to\s*(\d+)', user_query, re.IGNORECASE)
    if exact:
        price_min = price_max = int(exact.group(1))

    # Default price range if not set
    attributes['price_min'] = 0 if price_min is None else price_min
    attributes['price_max'] = float('inf') if price_max is None else price_max

    return attributes
def is_count_query(user_query):
    """Return True when the query asks for a number of flights.

    The query is normalised with ``normalize_query`` and ``synonym_map``,
    lower-cased, and then searched for each entry of ``count_patterns``.
    """
    lowered = normalize_query(user_query, synonym_map).lower()
    return any(re.search(pattern, lowered) for pattern in count_patterns)
# NOTE: this rebinds count_patterns to the same phrases as the earlier
# definition in this file; is_count_query reads whichever binding is current.
count_patterns = [
    r"how many flights",
    r"number of flights",
    r"total flights",
    r"can you tell me the number",
    r"number of planes",
    r"how many options",
    r"how many departures",
    r"tell me the available flights",
    r"available flights",
    r"total flights count",
    r"count of flights",
    r"flight count",
]
from math import inf
def preprocess_prices(df):
    """Convert the ``price`` column of *df* to integers, in place.

    Commas are removed from the values before the integer conversion. The
    same DataFrame object is returned for convenience.
    """
    without_commas = df['price'].replace(',', '', regex=True)
    df['price'] = without_commas.astype(int)
    return df
def extract_price_conditions(query):
    """Extract an inclusive price range from a free-text query.

    Recognised phrasings, checked in this order (first match wins):

    * ``between/from X and/to Y``
    * ``above/greater than/more than/at least X but/and
      under/below/at most/up to/less than Y``
    * an upper bound only (``under X``, ``at most X``, ...)
    * a lower bound only (``above X``, ``at least X``, ...)
    * ``exactly X`` / ``equal to X``

    A comparison may be followed by ``or equal to`` ("more than or equal to
    500"); it is then read as that comparison, not as an exact price.

    Args:
        query: The text to search (case-insensitive).

    Returns:
        ``(price_min, price_max)``; the defaults are ``0`` and ``inf``.
        For a two-sided range the smaller number is always returned first.
    """
    number = r'\s*(\d+)'
    or_equal = r'(?:\s+or\s+equal\s+to)?'
    lower_words = r'(?:above|greater than|more than|at least)' + or_equal
    upper_words = r'(?:under|below|at most|up to|less than)' + or_equal

    # Order matters: two-sided ranges must be tried before the one-sided
    # patterns, which would otherwise match one half of the range.
    price_patterns = [
        ('range', rf'\b(?:between|from){number}\s*(?:and|to){number}\b'),
        ('range', rf'\b{lower_words}{number}\s*(?:but|and)\s*{upper_words}{number}\b'),
        ('max', rf'\b{upper_words}{number}\b'),
        ('min', rf'\b{lower_words}{number}\b'),
        ('exact', rf'\b(?:exactly|equal to){number}\b'),
    ]

    price_min = 0
    price_max = inf

    for kind, pattern in price_patterns:
        match = re.search(pattern, query, re.IGNORECASE)
        if not match:
            continue
        if kind == 'range':
            # Sorting tolerates "between 500 and 100".
            price_min, price_max = sorted(
                (int(match.group(1)), int(match.group(2)))
            )
        elif kind == 'max':
            price_max = int(match.group(1))
        elif kind == 'min':
            price_min = int(match.group(1))
        else:  # exact
            price_min = price_max = int(match.group(1))
        break

    return price_min, price_max
def normalize_query(query, synonym_map):
    """Lower-case *query* and rewrite synonyms to their canonical terms.

    Args:
        query: The user text, or a dict whose ``'query'`` entry holds it
            (a dict without that key is treated as an empty string).
        synonym_map: Mapping of canonical term -> list of synonyms.

    Returns:
        The lower-cased query in which every whole-word synonym has been
        replaced by its canonical term. The canonical term is inserted as
        spelled in *synonym_map*, so it may contain capitals.
    """
    if isinstance(query, dict):
        query = query.get('query', '')

    query = query.lower()

    for standard_term, synonyms in synonym_map.items():
        # Longest synonym first so a short synonym cannot consume the start
        # of a longer one ("destination" vs "destination is").
        for synonym in sorted(synonyms, key=len, reverse=True):
            # The query is already lower-case, so the synonym must be
            # lower-cased too or capitalised entries can never match.
            needle = synonym.lower()
            if needle in query:
                query = re.sub(
                    rf'\b{re.escape(needle)}\b',
                    # Callable replacement inserts the term literally.
                    lambda _match, term=standard_term: term,
                    query,
                )
    return query
# Known airline names.
known_airlines = [
    'Air India',
    'Vistara',
]
# Canonical city spellings that normalize_city accepts.
city_names = [
    'Chennai',
    'Mumbai',
    'Delhi',
    'Kolkata',
    'Bangalore',
    'Hyderabad',
]
# Function to normalize city names
def normalize_city(city_name):
    """Return *city_name* in title case if it is a supported city, else None.

    The name is title-cased and stripped of surrounding whitespace before it
    is looked up in ``city_names``.
    """
    candidate = city_name.title().strip()
    return candidate if candidate in city_names else None
def normalize_date(date_str, default_year=2022):
    """Convert a loosely formatted date into ``DD-MM-YYYY``.

    Supported inputs (case-insensitive, ordinal suffixes allowed):

    * day then month name: ``12 Feb``, ``12th of February 2023``, ``12-Feb``
    * month name then day: ``February 12``, ``Feb 12th 2023``
    * ISO style: ``2023-02-12``
    * numeric day/month: ``12/02``, ``12-02-2023``, ``12.02.2023``

    Args:
        date_str: Text containing the date.
        default_year: Year used when the text has no four-digit year.

    Returns:
        The normalised date string, or ``None`` when *date_str* is not a
        string, no date is recognised, or the date does not exist in the
        calendar (e.g. ``31 Feb``).
    """
    if not isinstance(date_str, str):
        return None

    month_numbers = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12,
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
        'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'sept': 9, 'oct': 10,
        'nov': 11, 'dec': 12,
    }
    # Longest names first so "june" is tried before its prefix "jun".
    month_alt = '|'.join(sorted(month_numbers, key=len, reverse=True))
    sep = r'[\s,./-]*'

    def build(day, month, year):
        # Validate against the real calendar and format as DD-MM-YYYY.
        year = int(year) if year else int(default_year)
        day, month = int(day), int(month)
        try:
            datetime(year, month, day)
        except ValueError:
            return None
        return f'{day:02d}-{month:02d}-{year}'

    # Remove ordinal suffixes ("12th" -> "12").
    text = re.sub(r'(\d+)(?:st|nd|rd|th)\b', r'\1', date_str, flags=re.IGNORECASE)

    # Day before month. This must be tried before "month day": otherwise
    # "12 Feb 2022" is read as "Feb 20".
    found = re.search(
        rf'(?<!\d)(\d{{1,2}})\s*(?:[-/.,]\s*|of\s+)?({month_alt})(?![a-z])'
        rf'(?:{sep}(\d{{4}})(?!\d))?',
        text,
        re.IGNORECASE,
    )
    if found:
        return build(
            found.group(1), month_numbers[found.group(2).lower()], found.group(3)
        )

    # Month before day; the day may not be the start of a longer number.
    found = re.search(
        rf'(?<![a-z])({month_alt})(?![a-z]){sep}(\d{{1,2}})(?!\d)'
        rf'(?:{sep}(\d{{4}})(?!\d))?',
        text,
        re.IGNORECASE,
    )
    if found:
        return build(
            found.group(2), month_numbers[found.group(1).lower()], found.group(3)
        )

    # ISO style year-month-day.
    found = re.search(
        r'(?<!\d)(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})(?!\d)', text
    )
    if found:
        return build(found.group(3), found.group(2), found.group(1))

    # Numeric day/month with an optional four-digit year.
    found = re.search(
        r'(?<!\d)(\d{1,2})\s*[-/.]\s*(\d{1,2})(?!\d)'
        r'(?:\s*[-/.]\s*(\d{4})(?!\d))?',
        text,
    )
    if found:
        return build(found.group(1), found.group(2), found.group(3))

    return None