From 849bab43d68164df9c0b8c796598cc2d6e9e3b2e Mon Sep 17 00:00:00 2001 From: TTalex Date: Fri, 11 Aug 2023 16:10:01 +0200 Subject: [PATCH 1/9] Small UTC date tweaks to avoid problems on utc positive tz --- pages/home.py | 2 +- utils/db_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pages/home.py b/pages/home.py index 3aee59a..1b27f5d 100644 --- a/pages/home.py +++ b/pages/home.py @@ -64,7 +64,7 @@ def compute_sign_up_trend(uuid_df): def compute_trips_trend(trips_df, date_col): - trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True) + trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True, format='ISO8601') trips_df[date_col] = pd.DatetimeIndex(trips_df[date_col]).date res_df = ( trips_df diff --git a/utils/db_utils.py b/utils/db_utils.py index 1500633..c0deb42 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -47,7 +47,7 @@ def query_uuids(start_date, end_date): return df def query_confirmed_trips(start_date, end_date): - start_ts, end_ts = None, datetime.max.timestamp() + start_ts, end_ts = None, datetime.max.replace(tzinfo=timezone.utc).timestamp() if start_date is not None: start_ts = datetime.combine(start_date, datetime.min.time()).timestamp() From 286ce3feadc986e56b116cdc49a6bd91952dea02 Mon Sep 17 00:00:00 2001 From: TTalex Date: Fri, 11 Aug 2023 16:53:58 +0200 Subject: [PATCH 2/9] Feature: Added segment trip time page --- README.md | 7 +- app_sidebar_collapsible.py | 10 ++ pages/segment_trip_time.py | 326 +++++++++++++++++++++++++++++++++++++ requirements.txt | 9 + utils/db_utils.py | 86 +++++++++- 5 files changed, 436 insertions(+), 2 deletions(-) create mode 100644 pages/segment_trip_time.py diff --git a/README.md b/README.md index fb4be7a..d383f4a 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ pip install -r dashboard/requirements.txt Run the app: ``` -python app.py +python app_sidebar_collapsible.py ``` You can run the app on your browser at http://127.0.0.1:8050 @@ -133,6 +133,11 @@ specified in the following sections. - `map_bubble`: User can view the bubble map in the Map page. - `map_trip_lines`: User can view the trip lines map in the Map page. +### Segment Trip Time Page +- `segment_trip_time`: User can view this page. (default `true`) +- `segment_trip_time_full_trips`: User can see the table containing non-agregated data (default `true`) +- `segment_trip_time_min_users`: Minimal number of distinct users in data required to display anything (value is a number, default `0`). + ### Push Notification Page - `push_send`: User can send push notifications in the Push Notification page. diff --git a/app_sidebar_collapsible.py b/app_sidebar_collapsible.py index 64077db..a3abd9d 100644 --- a/app_sidebar_collapsible.py +++ b/app_sidebar_collapsible.py @@ -99,6 +99,15 @@ href=dash.get_relative_path("/map"), active="exact", ), + dbc.NavLink( + [ + html.I(className="fas fa-solid fa-hourglass me-2"), + html.Span("Segment trip time"), + ], + href=dash.get_relative_path("/segment_trip_time"), + active="exact", + style={'display': 'block' if has_permission('segment_trip_time') else 'none'}, + ), dbc.NavLink( [ html.I(className="fas fa-solid fa-envelope-open-text me-2"), @@ -186,6 +195,7 @@ def update_store_uuids(start_date, end_date): return store +# Note: this triggers twice on load, not great with a slow db @app.callback( Output("store-trips", "data"), Input('date-picker', 'start_date'), diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py new file mode 100644 index 0000000..8b38566 --- /dev/null +++ b/pages/segment_trip_time.py @@ -0,0 +1,326 @@ +from dash import dcc, html, Input, Output, State, callback, register_page, dash_table +import dash_bootstrap_components as dbc +import dash_leaflet as dl +import pandas as pd + +import emission.core.wrapper.modeprediction as ecwm +import logging + +from utils.permissions import has_permission, permissions +from utils import db_utils + +register_page(__name__, path="/segment_trip_time") + +intro = """ +## Segment average trip time +This page displays some statistics on average trip duration between two selected points. +""" + +first_step = """ +### First, select a detection radius. +This is the range in meters around start and end point where GPS data can be detected, closest data is prioritized. + +This should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate). + +A bigger number means more trips will be considered, however it might find trips on the wrong road if roads are close enough. +""" + +second_step = """ +### Then, select start and end points +These can be anywhere on the map, but are usually on road intersections. + +A circle will be shown with the detection radius set on the previous step. For accurate results, the circle should not cover more than one parallel road. +""" + +not_enough_data_message = f""" +Not enough segments could be found between endpoints. This means that the number of recorded trips going from start to end point is too low. +* There could be data, but on an insufficient number of users, breaking anonymity (minimum number of users is currently set to {permissions.get('segment_trip_time_min_users', 0)}) +* You could try to increase the detection radius, or chose different start and end points. +""" + +layout = html.Div( + [ + dcc.Store(id='link-trip-time-start', data=(0, 0)), + dcc.Store(id='link-trip-time-end', data=(0, 0)), + # dcc.Store(id='store-mode-by-section-id', data={}), + dcc.Markdown(intro), + dcc.Markdown(first_step), + dcc.Slider( + 0, + 2500, + id='detection-radius', + value=200, + tooltip={"placement": "bottom", "always_visible": True}, + ), + dcc.Markdown(second_step), + dbc.Row( + [ + dbc.Col( + [ + html.H4('Start point selection'), + dl.Map( + [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')], + id='stt-trip-map-start', + style={'height': '500px'}, + center=[32.7, -96.8], + zoom=5, + ), + ] + ), + dbc.Col( + [ + html.H4('End point selection'), + dl.Map( + [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-end')], + id='stt-trip-map-end', + style={'height': '500px'}, + center=[32.7, -96.8], + zoom=5, + ), + ] + ), + ] + ), + dbc.Row( + html.Div(id='message'), + ), + ] +) + + +def map_click(click_lat_lng, radius, circle_color): + # WARN: There seem to be a bug on click_lat_lng leaflet event, values can be out of bound (e.g: -357 for lat in eu), some kind of modulus might be required + # Couldn't reproduce it though + layer_children = [ + dl.Circle(center=click_lat_lng, radius=radius, color=circle_color) + ] + endpoint_coords = click_lat_lng + zoom = 13 + map_center = click_lat_lng + return layer_children, endpoint_coords, zoom, map_center + + +@callback( + Output('stt-trip-map-end', 'zoom', allow_duplicate=True), + Output('stt-trip-map-end', 'center', allow_duplicate=True), + Input('link-trip-time-start', 'data'), + State('stt-trip-map-end', 'zoom'), + State('stt-trip-map-end', 'center'), + State('link-trip-time-end', 'data'), + prevent_initial_call=True, +) +# This is optional, it's used on first selection to help user locate his endpoint +def center_end_map_helper(link_trip_time_start, end_zoom, end_center, link_trip_time_end): + if link_trip_time_end[0] == 0: + end_zoom = 13 + end_center = link_trip_time_start + return end_zoom, end_center + + +@callback( + Output('stt-trip-layer-start', 'children'), + Output('link-trip-time-start', 'data'), + Output('stt-trip-map-start', 'zoom'), + Output('stt-trip-map-start', 'center'), + Input('stt-trip-map-start', 'click_lat_lng'), + State('detection-radius', 'value'), + prevent_initial_call=True, +) +def map_start_click(click_lat_lng, radius): + return map_click(click_lat_lng, radius, "green") + + +@callback( + Output('stt-trip-layer-end', 'children'), + Output('link-trip-time-end', 'data'), + Output('stt-trip-map-end', 'zoom'), + Output('stt-trip-map-end', 'center'), + Input('stt-trip-map-end', 'click_lat_lng'), + State('detection-radius', 'value'), + prevent_initial_call=True, +) +def map_end_click(click_lat_lng, radius): + return map_click(click_lat_lng, radius, "red") + + +def format_duration_df(df, time_column_name='Time sample'): + df['Median time (minutes)'] = df.duration / 60 # convert seconds in minutes + df = df.reset_index().rename( + columns={ + 'start_fmt_time': time_column_name, + 'duration': 'Median time (seconds)', + 'section': 'Count', + 'mode': 'Mode', + } + ) + if time_column_name in df: + if 'Mode' in df: + df = df[ + [ + 'Mode', + time_column_name, + 'Median time (seconds)', + 'Median time (minutes)', + 'Count', + ] + ] # reorder cols + else: + df = df[ + [ + time_column_name, + 'Median time (seconds)', + 'Median time (minutes)', + 'Count', + ] + ] # reorder cols + else: + df = df[ + ['Mode', 'Median time (seconds)', 'Median time (minutes)', 'Count'] + ] # reorder cols + df = df.to_dict('records') # Format for display + return df + + +@callback( + Output('message', 'children'), + Input('link-trip-time-start', 'data'), + Input('link-trip-time-end', 'data'), + State('detection-radius', 'value'), + prevent_initial_call=True, +) +def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_end, radius): + if link_trip_time_end[0] == 0 or link_trip_time_start[0] == 0: + return '' + # logging.debug("link_trip_time_start: " + str(link_trip_time_start)) + # logging.debug("link_trip_time_end: " + str(link_trip_time_end)) + + # Warning: This is a database call, looks here if there is a performance hog. + # From initial tests, this seems to be performing well, without the need to do geoqueries in memory + df = db_utils.query_segments_crossing_endpoints( + link_trip_time_start[0], + link_trip_time_start[1], + link_trip_time_end[0], + link_trip_time_end[1], + radius, + ) + total_nb_trips = df.shape[0] + if total_nb_trips > 0: + modes = [e.name for e in ecwm.PredictedModeTypes] + # Warning: Another db call here. + # In theory, we could load all inferred_section modes in memory at start time, instead of fetching it everytime + # However, when testing it, the operation is quite heavy on the db and on ram. + # I opted for querying only sections we're interested in, every time. Page load is still decent, especially when the number of section is low. + mode_by_section_id = db_utils.query_inferred_sections_modes( + df['section'].to_list() + ) + df['mode'] = df['section'].apply( + lambda section_id: modes[mode_by_section_id[str(section_id)]] + ) + median_trip_time = df['duration'].median() + times = pd.to_datetime(df['start_fmt_time'], errors='coerce', utc=True) + duration_per_hour = format_duration_df( + df.groupby(times.dt.hour).agg({'duration': 'median', 'section': 'count'}), + time_column_name='Hour', + ) + duration_per_mode = format_duration_df( + df.groupby('mode').agg({'duration': 'median', 'section': 'count'}) + ) + duration_per_mode_per_hour = format_duration_df( + df.groupby(['mode', times.dt.hour]).agg( + {'duration': 'median', 'section': 'count'} + ), + time_column_name='Hour', + ) + duration_per_mode_per_month = format_duration_df( + df.groupby(['mode', times.dt.month]).agg( + {'duration': 'median', 'section': 'count'} + ), + time_column_name='Month', + ) + return dbc.Row( + [ + dbc.Col( + [ + html.Br(), + html.H3('Results'), + html.Div( + f'Computed median segment duration is {median_trip_time} seconds, {total_nb_trips} trips considered' + ), + html.Br(), + html.H4('Median segment duration by mode of transport'), + dash_table.DataTable( + id='duration_per_mode', + data=duration_per_mode, + sort_action='native', + sort_mode='multi', + export_format='csv', + ), + html.Br(), + html.H4( + 'Median segment duration by mode and hour of the day (UTC)' + ), + dash_table.DataTable( + id='duration_per_hour', + data=duration_per_hour, + sort_action='native', + sort_mode='multi', + export_format='csv', + ), + html.Br(), + html.H4( + 'Median segment duration by mode and hour of the day (UTC)' + ), + dash_table.DataTable( + id='duration_per_mode_per_hour', + data=duration_per_mode_per_hour, + sort_action='native', + sort_mode='multi', + export_format='csv', + ), + html.Br(), + html.H4('Median segment duration by mode and month'), + dash_table.DataTable( + id='duration_per_mode_per_month', + data=duration_per_mode_per_month, + sort_action='native', + sort_mode='multi', + export_format='csv', + ), + ], + xs=6, + ), + dbc.Col( + [ + html.Br(), + html.H3('Trips Data'), + dash_table.DataTable( + id='trips_data', + data=df[ + ['start_fmt_time', 'end_fmt_time', 'mode', 'duration'] + ].to_dict('records'), + page_size=15, + sort_action='native', + sort_mode='multi', + export_format='csv', + ), + ], + xs=6, + style={ + 'display': 'block' + if has_permission('segment_trip_time_full_trips') + else 'none' + }, + ), + ] + ) + return [html.H3('Results'), dcc.Markdown(not_enough_data_message)] + + +# This is left as an example on loading all inferred_section modes in memory at start time, instead of fetching it everytime +# This lead to poor memory performances on a larger db +# @callback( +# Output('store-mode-by-section-id', 'data'), +# Input('store-trips', 'data') # Only using this an initial trigger +# ) +# def load_mode_by_section_id(s): +# return db_utils.query_inferred_sections_modes() diff --git a/requirements.txt b/requirements.txt index 6f1bce8..cd6b59d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,12 @@ python-jose==3.3.0 flask==2.2.5 flask-talisman==1.0.0 dash_auth==2.0.0 +dash-leaflet==0.1.28 + +# emission requirements +arrow==1.2.3 +pymongo==4.4.1 +pandas==2.0.3 +future==0.18.3 +attrdict==2.0.1 +geojson==3.0.1 diff --git a/utils/db_utils.py b/utils/db_utils.py index c0deb42..658d9b0 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -3,6 +3,7 @@ from uuid import UUID import arrow +from bson.son import SON import pandas as pd import pymongo @@ -158,4 +159,87 @@ def add_user_stats(user_data): if last_call != -1: user['last_call'] = arrow.get(last_call).format(time_format) - return user_data \ No newline at end of file + return user_data + +def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, range_around_endpoints=400): + # data.loc only appears in analysis/recreated_location + query_start = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [start_long, start_lat])])), ('$maxDistance', range_around_endpoints)])}} + query_end = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [end_long, end_lat])])), ('$maxDistance', range_around_endpoints)])}} + + res_end = edb.get_analysis_timeseries_db().find(query_end) + end_by_section = {} + for elt in res_end: + elt_data = elt.get('data') + if elt_data: + section_id = elt_data.get('section') + # This makes sure that only the first encounter of the section is added + # The near query gives closest points first, so the first one is the one we want + if section_id not in end_by_section: + end_by_section[section_id] = elt + + res_start = edb.get_analysis_timeseries_db().find(query_start) + start_by_section = {} + for elt in res_start: + elt_data = elt.get('data') + if elt_data: + section_id = elt_data.get('section') + if section_id not in start_by_section: + start_by_section[section_id] = elt + + vals = [] + user_id_seen_dict = {} + number_user_seen = 0 + # Now we can read every section crossing start point + for section_id in start_by_section: + matching_start = start_by_section[section_id] + matching_start_data = matching_start.get('data') + if matching_start_data is None: + # Something is wrong with the fetched data, shouldn't happen + continue + if 'idx' not in matching_start_data: + # Sometimes, idx is missing in data, not sure why + continue + matching_end = end_by_section.get(section_id) + if matching_end is None: + # This section_id didn't cross the end point + continue + matching_end_data = matching_end.get('data', {}) + # idx allows us to check that the start section is crossed first, we do not care about trips going the other way + if 'idx' in matching_end_data and matching_start_data.get('idx') < matching_end_data.get('idx'): + user_id = str(start_by_section[section_id].get('user_id')) + if user_id_seen_dict.get(user_id) is None: + number_user_seen += 1 + user_id_seen_dict[user_id] = True + vals.append({ + 'start': start_by_section[section_id], + 'end': end_by_section[section_id], + 'duration': matching_end_data.get('ts') - matching_start_data.get('ts'), + 'section': section_id, + 'mode': matching_start_data.get('mode'), # Note: this is the mode given by the phone, not the computed one, we'll read it later from inferred_sections + 'start_fmt_time': matching_start_data.get('fmt_time'), + 'end_fmt_time': matching_end_data.get('fmt_time') + }) + if perm_utils.permissions.get("segment_trip_time_min_users", 0) <= number_user_seen: + return pd.DataFrame.from_dict(vals) + return pd.DataFrame.from_dict([]) + +# The following query can be called multiple times, so we extract index creation from the function +analysis_timeseries_db = edb.get_analysis_timeseries_db() +# In theory, adding an index on the cleaned_section should improve query performance +analysis_timeseries_db.create_index([('data.cleaned_section', pymongo.ASCENDING)]) + +# When sections isn't set, this fetches all inferred_section +# Otherwise, it filters on given section ids using '$in' +# Note: for performance reasons, it is not recommended to use '$in' a list bigger than ~100 values +# In our use case, this could happen on popular trips, but the delay is deemed acceptable +def query_inferred_sections_modes(sections=[]): + query = {'metadata.key': 'analysis/inferred_section'} + if len(sections) > 0: + query['data.cleaned_section'] = {'$in': sections} + res = analysis_timeseries_db.find(query, {'data.cleaned_section': 1, 'data.sensed_mode': 1}) + mode_by_section_id = {} + for elt in res: + elt_data = elt.get('data') + if elt_data: + mode_by_section_id[str(elt_data.get('cleaned_section'))] = elt_data.get('sensed_mode') or 0 + return mode_by_section_id From c56af0928799896b873f8b2822ebd5a16dc1164e Mon Sep 17 00:00:00 2001 From: TTalex Date: Fri, 11 Aug 2023 17:57:45 +0200 Subject: [PATCH 3/9] Fix typos --- README.md | 2 +- pages/segment_trip_time.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d383f4a..fbb9bee 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ specified in the following sections. ### Segment Trip Time Page - `segment_trip_time`: User can view this page. (default `true`) -- `segment_trip_time_full_trips`: User can see the table containing non-agregated data (default `true`) +- `segment_trip_time_full_trips`: User can see the table containing non-aggregated data (default `true`) - `segment_trip_time_min_users`: Minimal number of distinct users in data required to display anything (value is a number, default `0`). ### Push Notification Page diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py index 8b38566..1d28dab 100644 --- a/pages/segment_trip_time.py +++ b/pages/segment_trip_time.py @@ -257,7 +257,7 @@ def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_en ), html.Br(), html.H4( - 'Median segment duration by mode and hour of the day (UTC)' + 'Median segment duration by hour of the day (UTC)' ), dash_table.DataTable( id='duration_per_hour', From 84f946d7bfb6ef2d928b6a5d70f524ef3b41e50a Mon Sep 17 00:00:00 2001 From: TTalex Date: Wed, 23 Aug 2023 16:05:55 +0200 Subject: [PATCH 4/9] Cleanup requirements --- requirements.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index cd6b59d..ffa57a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,11 +15,3 @@ flask==2.2.5 flask-talisman==1.0.0 dash_auth==2.0.0 dash-leaflet==0.1.28 - -# emission requirements -arrow==1.2.3 -pymongo==4.4.1 -pandas==2.0.3 -future==0.18.3 -attrdict==2.0.1 -geojson==3.0.1 From 88c9665ff0a6f0863372e239ed8f038eadcb5770 Mon Sep 17 00:00:00 2001 From: TTalex Date: Wed, 23 Aug 2023 16:20:43 +0200 Subject: [PATCH 5/9] Remove db index creation (moved to emission core) --- utils/db_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/utils/db_utils.py b/utils/db_utils.py index 658d9b0..88c2eef 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -223,10 +223,8 @@ def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, return pd.DataFrame.from_dict(vals) return pd.DataFrame.from_dict([]) -# The following query can be called multiple times, so we extract index creation from the function +# The following query can be called multiple times, let's open db only once analysis_timeseries_db = edb.get_analysis_timeseries_db() -# In theory, adding an index on the cleaned_section should improve query performance -analysis_timeseries_db.create_index([('data.cleaned_section', pymongo.ASCENDING)]) # When sections isn't set, this fetches all inferred_section # Otherwise, it filters on given section ids using '$in' From 25e51792d786541ef547f821b8cc38f15567213d Mon Sep 17 00:00:00 2001 From: TTalex Date: Wed, 23 Aug 2023 16:49:00 +0200 Subject: [PATCH 6/9] Drop SON usage, makes for cleaner dicts --- utils/db_utils.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/utils/db_utils.py b/utils/db_utils.py index 88c2eef..f3a676d 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -3,7 +3,6 @@ from uuid import UUID import arrow -from bson.son import SON import pandas as pd import pymongo @@ -163,9 +162,28 @@ def add_user_stats(user_data): def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, range_around_endpoints=400): # data.loc only appears in analysis/recreated_location - query_start = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [start_long, start_lat])])), ('$maxDistance', range_around_endpoints)])}} - query_end = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [end_long, end_lat])])), ('$maxDistance', range_around_endpoints)])}} - + query_start = { + 'data.loc': { + '$near': { + '$geometry': { + 'type': 'Point', + 'coordinates': [start_long, start_lat] + }, + '$maxDistance': range_around_endpoints + } + } + } + query_end = { + 'data.loc': { + '$near': { + '$geometry': { + 'type': 'Point', + 'coordinates': [end_long, end_lat] + }, + '$maxDistance': range_around_endpoints + } + } + } res_end = edb.get_analysis_timeseries_db().find(query_end) end_by_section = {} for elt in res_end: From 59e0c9810e68981593afef224a345f5c38732227 Mon Sep 17 00:00:00 2001 From: TTalex Date: Tue, 29 Aug 2023 15:04:13 +0200 Subject: [PATCH 7/9] Dropped point and range selection, users now draw polygons for start and end zones Rewrote data fetching to use the geoquery sdk --- pages/segment_trip_time.py | 140 +++++++++++++++---------------------- requirements.txt | 2 +- utils/db_utils.py | 99 ++++++-------------------- 3 files changed, 79 insertions(+), 162 deletions(-) diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py index 1d28dab..9a28f46 100644 --- a/pages/segment_trip_time.py +++ b/pages/segment_trip_time.py @@ -5,6 +5,7 @@ import emission.core.wrapper.modeprediction as ecwm import logging +import json from utils.permissions import has_permission, permissions from utils import db_utils @@ -13,69 +14,76 @@ intro = """ ## Segment average trip time -This page displays some statistics on average trip duration between two selected points. -""" +This page displays some statistics on average trip duration between two selected zones. -first_step = """ -### First, select a detection radius. -This is the range in meters around start and end point where GPS data can be detected, closest data is prioritized. +### Usage +Using the polygon or square tools on the maps' menu, draw the start (left map) and end (right map) zones to consider. -This should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate). +Data will then be fetched for trips crossing the start zone and then the end zone. -A bigger number means more trips will be considered, however it might find trips on the wrong road if roads are close enough. +Here are some tips on how to draw zones: +* Zones shouldn't cover more than one parallel road; otherwise, it is unclear which path the user took. +* A bigger zone will give more results, at the cost of lower accuracy in trip durations (the start point could be anywhere in the zone). +* For exhaustivity, zone length should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate). +* A smaller zone will give more accurate time results, but the number of trips might be too low to be significant. +* Zones can be moved and edited using the Edit layer menu, and they can be deleted with the Delete layer button. +* Please be advised that only the last added zone will be considered on each map. It is thus advised to delete existing zones before creating new ones. """ -second_step = """ -### Then, select start and end points -These can be anywhere on the map, but are usually on road intersections. - -A circle will be shown with the detection radius set on the previous step. For accurate results, the circle should not cover more than one parallel road. -""" not_enough_data_message = f""" Not enough segments could be found between endpoints. This means that the number of recorded trips going from start to end point is too low. * There could be data, but on an insufficient number of users, breaking anonymity (minimum number of users is currently set to {permissions.get('segment_trip_time_min_users', 0)}) -* You could try to increase the detection radius, or chose different start and end points. +* You could try to increase the zone sizes, or chose different start and end points. """ +initial_maps_center = [32.7, -96.8] +initial_maps_zoom = 5 layout = html.Div( [ - dcc.Store(id='link-trip-time-start', data=(0, 0)), - dcc.Store(id='link-trip-time-end', data=(0, 0)), - # dcc.Store(id='store-mode-by-section-id', data={}), + dcc.Store(id='link-trip-time-start', data=json.dumps({"features": []})), + dcc.Store(id='link-trip-time-end', data=json.dumps({"features": []})), dcc.Markdown(intro), - dcc.Markdown(first_step), - dcc.Slider( - 0, - 2500, - id='detection-radius', - value=200, - tooltip={"placement": "bottom", "always_visible": True}, - ), - dcc.Markdown(second_step), dbc.Row( [ dbc.Col( [ - html.H4('Start point selection'), + html.H4('Start zone selection'), dl.Map( - [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')], + [ + dl.TileLayer(), + dl.FeatureGroup([ + dl.EditControl( + id="stt-edit-control-start", + draw=dict(circle=False, marker=False, polyline=False, circlemarker=False) + ) + ]) + ], + #[dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')], id='stt-trip-map-start', style={'height': '500px'}, - center=[32.7, -96.8], - zoom=5, + center=initial_maps_center, + zoom=initial_maps_zoom ), ] ), dbc.Col( [ - html.H4('End point selection'), + html.H4('End zone selection'), dl.Map( - [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-end')], + [ + dl.TileLayer(), + dl.FeatureGroup([ + dl.EditControl( + id="stt-edit-control-end", + draw=dict(circle=False, marker=False, polyline=False, circlemarker=False) + ) + ]) + ], id='stt-trip-map-end', style={'height': '500px'}, - center=[32.7, -96.8], - zoom=5, + center=initial_maps_center, + zoom=initial_maps_zoom ), ] ), @@ -88,59 +96,23 @@ ) -def map_click(click_lat_lng, radius, circle_color): - # WARN: There seem to be a bug on click_lat_lng leaflet event, values can be out of bound (e.g: -357 for lat in eu), some kind of modulus might be required - # Couldn't reproduce it though - layer_children = [ - dl.Circle(center=click_lat_lng, radius=radius, color=circle_color) - ] - endpoint_coords = click_lat_lng - zoom = 13 - map_center = click_lat_lng - return layer_children, endpoint_coords, zoom, map_center - - -@callback( - Output('stt-trip-map-end', 'zoom', allow_duplicate=True), - Output('stt-trip-map-end', 'center', allow_duplicate=True), - Input('link-trip-time-start', 'data'), - State('stt-trip-map-end', 'zoom'), - State('stt-trip-map-end', 'center'), - State('link-trip-time-end', 'data'), - prevent_initial_call=True, -) -# This is optional, it's used on first selection to help user locate his endpoint -def center_end_map_helper(link_trip_time_start, end_zoom, end_center, link_trip_time_end): - if link_trip_time_end[0] == 0: - end_zoom = 13 - end_center = link_trip_time_start - return end_zoom, end_center - @callback( - Output('stt-trip-layer-start', 'children'), Output('link-trip-time-start', 'data'), - Output('stt-trip-map-start', 'zoom'), - Output('stt-trip-map-start', 'center'), - Input('stt-trip-map-start', 'click_lat_lng'), - State('detection-radius', 'value'), + Input('stt-edit-control-start', 'geojson'), prevent_initial_call=True, ) -def map_start_click(click_lat_lng, radius): - return map_click(click_lat_lng, radius, "green") - +def map_start_draw(geojson): + return json.dumps(geojson) @callback( - Output('stt-trip-layer-end', 'children'), Output('link-trip-time-end', 'data'), - Output('stt-trip-map-end', 'zoom'), - Output('stt-trip-map-end', 'center'), - Input('stt-trip-map-end', 'click_lat_lng'), - State('detection-radius', 'value'), + Input('stt-edit-control-end', 'geojson'), prevent_initial_call=True, ) -def map_end_click(click_lat_lng, radius): - return map_click(click_lat_lng, radius, "red") +def map_end_draw(geojson): + return json.dumps(geojson) + def format_duration_df(df, time_column_name='Time sample'): @@ -185,11 +157,12 @@ def format_duration_df(df, time_column_name='Time sample'): Output('message', 'children'), Input('link-trip-time-start', 'data'), Input('link-trip-time-end', 'data'), - State('detection-radius', 'value'), prevent_initial_call=True, ) -def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_end, radius): - if link_trip_time_end[0] == 0 or link_trip_time_start[0] == 0: +def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_time_end_str): + link_trip_time_start = json.loads(link_trip_time_start_str) + link_trip_time_end = json.loads(link_trip_time_end_str) + if len(link_trip_time_end["features"]) == 0 or len(link_trip_time_start["features"]) == 0: return '' # logging.debug("link_trip_time_start: " + str(link_trip_time_start)) # logging.debug("link_trip_time_end: " + str(link_trip_time_end)) @@ -197,11 +170,8 @@ def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_en # Warning: This is a database call, looks here if there is a performance hog. # From initial tests, this seems to be performing well, without the need to do geoqueries in memory df = db_utils.query_segments_crossing_endpoints( - link_trip_time_start[0], - link_trip_time_start[1], - link_trip_time_end[0], - link_trip_time_end[1], - radius, + link_trip_time_start["features"][len(link_trip_time_start["features"])-1], + link_trip_time_end["features"][len(link_trip_time_end["features"])-1], ) total_nb_trips = df.shape[0] if total_nb_trips > 0: diff --git a/requirements.txt b/requirements.txt index ffa57a4..61cf274 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ python-jose==3.3.0 flask==2.2.5 flask-talisman==1.0.0 dash_auth==2.0.0 -dash-leaflet==0.1.28 +dash-leaflet==1.0.7 diff --git a/utils/db_utils.py b/utils/db_utils.py index f3a676d..14985a8 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -9,7 +9,9 @@ import emission.core.get_database as edb import emission.storage.timeseries.abstract_timeseries as esta +import emission.storage.timeseries.aggregate_timeseries as estag import emission.storage.timeseries.timequery as estt +import emission.storage.timeseries.geoquery as estg from utils import constants from utils import permissions as perm_utils @@ -160,85 +162,30 @@ def add_user_stats(user_data): return user_data -def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, range_around_endpoints=400): - # data.loc only appears in analysis/recreated_location - query_start = { - 'data.loc': { - '$near': { - '$geometry': { - 'type': 'Point', - 'coordinates': [start_long, start_lat] - }, - '$maxDistance': range_around_endpoints - } - } - } - query_end = { - 'data.loc': { - '$near': { - '$geometry': { - 'type': 'Point', - 'coordinates': [end_long, end_lat] - }, - '$maxDistance': range_around_endpoints - } - } - } - res_end = edb.get_analysis_timeseries_db().find(query_end) - end_by_section = {} - for elt in res_end: - elt_data = elt.get('data') - if elt_data: - section_id = elt_data.get('section') - # This makes sure that only the first encounter of the section is added - # The near query gives closest points first, so the first one is the one we want - if section_id not in end_by_section: - end_by_section[section_id] = elt +def query_segments_crossing_endpoints(poly_region_start, poly_region_end): + agg_ts = estag.AggregateTimeSeries().get_aggregate_time_series() + + locs_matching_start = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_start)) + locs_matching_start = locs_matching_start.drop_duplicates(subset=['section']) + if locs_matching_start.empty: + return locs_matching_start - res_start = edb.get_analysis_timeseries_db().find(query_start) - start_by_section = {} - for elt in res_start: - elt_data = elt.get('data') - if elt_data: - section_id = elt_data.get('section') - if section_id not in start_by_section: - start_by_section[section_id] = elt + locs_matching_end = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_end)) + locs_matching_end = locs_matching_end.drop_duplicates(subset=['section']) + if locs_matching_end.empty: + return locs_matching_end - vals = [] - user_id_seen_dict = {} - number_user_seen = 0 - # Now we can read every section crossing start point - for section_id in start_by_section: - matching_start = start_by_section[section_id] - matching_start_data = matching_start.get('data') - if matching_start_data is None: - # Something is wrong with the fetched data, shouldn't happen - continue - if 'idx' not in matching_start_data: - # Sometimes, idx is missing in data, not sure why - continue - matching_end = end_by_section.get(section_id) - if matching_end is None: - # This section_id didn't cross the end point - continue - matching_end_data = matching_end.get('data', {}) - # idx allows us to check that the start section is crossed first, we do not care about trips going the other way - if 'idx' in matching_end_data and matching_start_data.get('idx') < matching_end_data.get('idx'): - user_id = str(start_by_section[section_id].get('user_id')) - if user_id_seen_dict.get(user_id) is None: - number_user_seen += 1 - user_id_seen_dict[user_id] = True - vals.append({ - 'start': start_by_section[section_id], - 'end': end_by_section[section_id], - 'duration': matching_end_data.get('ts') - matching_start_data.get('ts'), - 'section': section_id, - 'mode': matching_start_data.get('mode'), # Note: this is the mode given by the phone, not the computed one, we'll read it later from inferred_sections - 'start_fmt_time': matching_start_data.get('fmt_time'), - 'end_fmt_time': matching_end_data.get('fmt_time') - }) + merged = locs_matching_start.merge(locs_matching_end, how='outer', on=['section']) + filtered = merged.loc[merged['idx_x'] Date: Tue, 29 Aug 2023 16:32:24 +0200 Subject: [PATCH 8/9] Segment trip time: also use sdk to fetch sensed modes --- pages/segment_trip_time.py | 16 ++-------------- utils/db_utils.py | 25 ++++++++++++------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py index 9a28f46..e9518bc 100644 --- a/pages/segment_trip_time.py +++ b/pages/segment_trip_time.py @@ -3,7 +3,6 @@ import dash_leaflet as dl import pandas as pd -import emission.core.wrapper.modeprediction as ecwm import logging import json @@ -175,16 +174,15 @@ def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_tim ) total_nb_trips = df.shape[0] if total_nb_trips > 0: - modes = [e.name for e in ecwm.PredictedModeTypes] # Warning: Another db call here. # In theory, we could load all inferred_section modes in memory at start time, instead of fetching it everytime # However, when testing it, the operation is quite heavy on the db and on ram. # I opted for querying only sections we're interested in, every time. Page load is still decent, especially when the number of section is low. mode_by_section_id = db_utils.query_inferred_sections_modes( - df['section'].to_list() + df[['section', 'user_id']].to_dict('records') ) df['mode'] = df['section'].apply( - lambda section_id: modes[mode_by_section_id[str(section_id)]] + lambda section_id: mode_by_section_id[str(section_id)].name ) median_trip_time = df['duration'].median() times = pd.to_datetime(df['start_fmt_time'], errors='coerce', utc=True) @@ -284,13 +282,3 @@ def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_tim ] ) return [html.H3('Results'), dcc.Markdown(not_enough_data_message)] - - -# This is left as an example on loading all inferred_section modes in memory at start time, instead of fetching it everytime -# This lead to poor memory performances on a larger db -# @callback( -# Output('store-mode-by-section-id', 'data'), -# Input('store-trips', 'data') # Only using this an initial trigger -# ) -# def load_mode_by_section_id(s): -# return db_utils.query_inferred_sections_modes() diff --git a/utils/db_utils.py b/utils/db_utils.py index 14985a8..923b0f3 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -12,6 +12,8 @@ import emission.storage.timeseries.aggregate_timeseries as estag import emission.storage.timeseries.timequery as estt import emission.storage.timeseries.geoquery as estg +import emission.storage.decorations.section_queries as esds +import emission.core.wrapper.modeprediction as ecwm from utils import constants from utils import permissions as perm_utils @@ -181,6 +183,7 @@ def query_segments_crossing_endpoints(poly_region_start, poly_region_end): filtered['mode'] = filtered['mode_x'] filtered['start_fmt_time'] = filtered['fmt_time_x'] filtered['end_fmt_time'] = filtered['fmt_time_y'] + filtered['user_id'] = filtered['user_id_y'] number_user_seen = filtered.user_id_x.nunique() @@ -191,18 +194,14 @@ def query_segments_crossing_endpoints(poly_region_start, poly_region_end): # The following query can be called multiple times, let's open db only once analysis_timeseries_db = edb.get_analysis_timeseries_db() -# When sections isn't set, this fetches all inferred_section -# Otherwise, it filters on given section ids using '$in' -# Note: for performance reasons, it is not recommended to use '$in' a list bigger than ~100 values -# In our use case, this could happen on popular trips, but the delay is deemed acceptable -def query_inferred_sections_modes(sections=[]): - query = {'metadata.key': 'analysis/inferred_section'} - if len(sections) > 0: - query['data.cleaned_section'] = {'$in': sections} - res = analysis_timeseries_db.find(query, {'data.cleaned_section': 1, 'data.sensed_mode': 1}) +# Fetches sensed_mode for each section in a list +# sections format example: [{'section': ObjectId('648d02b227fd2bb6635414a0'), 'user_id': UUID('6d7edf29-8b3f-451b-8d66-984cb8dd8906')}] +def query_inferred_sections_modes(sections): mode_by_section_id = {} - for elt in res: - elt_data = elt.get('data') - if elt_data: - mode_by_section_id[str(elt_data.get('cleaned_section'))] = elt_data.get('sensed_mode') or 0 + for section in sections: + matching_inferred_section = esds.cleaned2inferred_section(section.get('user_id'), section.get('section')) + if matching_inferred_section is None: + mode_by_section_id[str(section.get('section'))] = ecwm.PredictedModeTypes.UNKNOWN + else: + mode_by_section_id[str(section.get('section'))] = matching_inferred_section.data.sensed_mode # PredictedModeTypes return mode_by_section_id From 1fea161d4b8ece20af099851ee66dda0794e43e4 Mon Sep 17 00:00:00 2001 From: TTalex Date: Mon, 2 Oct 2023 11:11:51 +0200 Subject: [PATCH 9/9] Use new cleaned2inferred_section_list interface --- utils/db_utils.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/utils/db_utils.py b/utils/db_utils.py index 923b0f3..96a359c 100644 --- a/utils/db_utils.py +++ b/utils/db_utils.py @@ -197,11 +197,5 @@ def query_segments_crossing_endpoints(poly_region_start, poly_region_end): # Fetches sensed_mode for each section in a list # sections format example: [{'section': ObjectId('648d02b227fd2bb6635414a0'), 'user_id': UUID('6d7edf29-8b3f-451b-8d66-984cb8dd8906')}] def query_inferred_sections_modes(sections): - mode_by_section_id = {} - for section in sections: - matching_inferred_section = esds.cleaned2inferred_section(section.get('user_id'), section.get('section')) - if matching_inferred_section is None: - mode_by_section_id[str(section.get('section'))] = ecwm.PredictedModeTypes.UNKNOWN - else: - mode_by_section_id[str(section.get('section'))] = matching_inferred_section.data.sensed_mode # PredictedModeTypes - return mode_by_section_id + return esds.cleaned2inferred_section_list(sections) +