From 849bab43d68164df9c0b8c796598cc2d6e9e3b2e Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Fri, 11 Aug 2023 16:10:01 +0200
Subject: [PATCH 1/9] Small UTC date tweaks to avoid problems on utc positive
 tz

---
 pages/home.py     | 2 +-
 utils/db_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pages/home.py b/pages/home.py
index 3aee59a..1b27f5d 100644
--- a/pages/home.py
+++ b/pages/home.py
@@ -64,7 +64,7 @@ def compute_sign_up_trend(uuid_df):
 
 
 def compute_trips_trend(trips_df, date_col):
-    trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True)
+    trips_df[date_col] = pd.to_datetime(trips_df[date_col], utc=True, format='ISO8601')
     trips_df[date_col] = pd.DatetimeIndex(trips_df[date_col]).date
     res_df = (
         trips_df
diff --git a/utils/db_utils.py b/utils/db_utils.py
index 1500633..c0deb42 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -47,7 +47,7 @@ def query_uuids(start_date, end_date):
     return df
 
 def query_confirmed_trips(start_date, end_date):
-    start_ts, end_ts = None, datetime.max.timestamp()
+    start_ts, end_ts = None, datetime.max.replace(tzinfo=timezone.utc).timestamp()
     if start_date is not None:
         start_ts = datetime.combine(start_date, datetime.min.time()).timestamp()
 

From 286ce3feadc986e56b116cdc49a6bd91952dea02 Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Fri, 11 Aug 2023 16:53:58 +0200
Subject: [PATCH 2/9] Feature: Added segment trip time page

---
 README.md                  |   7 +-
 app_sidebar_collapsible.py |  10 ++
 pages/segment_trip_time.py | 326 +++++++++++++++++++++++++++++++++++++
 requirements.txt           |   9 +
 utils/db_utils.py          |  86 +++++++++-
 5 files changed, 436 insertions(+), 2 deletions(-)
 create mode 100644 pages/segment_trip_time.py

diff --git a/README.md b/README.md
index fb4be7a..d383f4a 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ pip install -r dashboard/requirements.txt
 Run the app:
 
 ```
-python app.py
+python app_sidebar_collapsible.py
 ```
 You can run the app on your browser at http://127.0.0.1:8050
 
@@ -133,6 +133,11 @@ specified in the following sections.
 - `map_bubble`: User can view the bubble map in the Map page.
 - `map_trip_lines`: User can view the trip lines map in the Map page.
 
+### Segment Trip Time Page
+- `segment_trip_time`: User can view this page. (default `true`)
+- `segment_trip_time_full_trips`: User can see the table containing non-agregated data (default `true`)
+- `segment_trip_time_min_users`: Minimal number of distinct users in data required to display anything (value is a number, default `0`).
+
 ### Push Notification Page
 - `push_send`: User can send push notifications in the Push Notification page.
 
diff --git a/app_sidebar_collapsible.py b/app_sidebar_collapsible.py
index 64077db..a3abd9d 100644
--- a/app_sidebar_collapsible.py
+++ b/app_sidebar_collapsible.py
@@ -99,6 +99,15 @@
                     href=dash.get_relative_path("/map"),
                     active="exact",
                 ),
+                dbc.NavLink(
+                    [
+                        html.I(className="fas fa-solid fa-hourglass me-2"),
+                        html.Span("Segment trip time"),
+                    ],
+                    href=dash.get_relative_path("/segment_trip_time"),
+                    active="exact",
+                    style={'display': 'block' if has_permission('segment_trip_time') else 'none'},
+                ),
                 dbc.NavLink(
                     [
                         html.I(className="fas fa-solid fa-envelope-open-text me-2"),
@@ -186,6 +195,7 @@ def update_store_uuids(start_date, end_date):
     return store
 
 
+# Note: this triggers twice on load, not great with a slow db
 @app.callback(
     Output("store-trips", "data"),
     Input('date-picker', 'start_date'),
diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py
new file mode 100644
index 0000000..8b38566
--- /dev/null
+++ b/pages/segment_trip_time.py
@@ -0,0 +1,326 @@
+from dash import dcc, html, Input, Output, State, callback, register_page, dash_table
+import dash_bootstrap_components as dbc
+import dash_leaflet as dl
+import pandas as pd
+
+import emission.core.wrapper.modeprediction as ecwm
+import logging
+
+from utils.permissions import has_permission, permissions
+from utils import db_utils
+
+register_page(__name__, path="/segment_trip_time")
+
+intro = """
+## Segment average trip time
+This page displays some statistics on average trip duration between two selected points.
+"""
+
+first_step = """
+### First, select a detection radius.
+This is the range in meters around start and end point where GPS data can be detected, closest data is prioritized.
+
+This should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate).
+
+A bigger number means more trips will be considered, however it might find trips on the wrong road if roads are close enough.
+"""
+
+second_step = """
+### Then, select start and end points
+These can be anywhere on the map, but are usually on road intersections.
+
+A circle will be shown with the detection radius set on the previous step. For accurate results, the circle should not cover more than one parallel road.
+"""
+
+not_enough_data_message = f"""
+Not enough segments could be found between endpoints. This means that the number of recorded trips going from start to end point is too low. 
+* There could be data, but on an insufficient number of users, breaking anonymity (minimum number of users is currently set to {permissions.get('segment_trip_time_min_users', 0)})
+* You could try to increase the detection radius, or chose different start and end points.
+"""
+
+layout = html.Div(
+    [
+        dcc.Store(id='link-trip-time-start', data=(0, 0)),
+        dcc.Store(id='link-trip-time-end', data=(0, 0)),
+        # dcc.Store(id='store-mode-by-section-id', data={}),
+        dcc.Markdown(intro),
+        dcc.Markdown(first_step),
+        dcc.Slider(
+            0,
+            2500,
+            id='detection-radius',
+            value=200,
+            tooltip={"placement": "bottom", "always_visible": True},
+        ),
+        dcc.Markdown(second_step),
+        dbc.Row(
+            [
+                dbc.Col(
+                    [
+                        html.H4('Start point selection'),
+                        dl.Map(
+                            [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')],
+                            id='stt-trip-map-start',
+                            style={'height': '500px'},
+                            center=[32.7, -96.8],
+                            zoom=5,
+                        ),
+                    ]
+                ),
+                dbc.Col(
+                    [
+                        html.H4('End point selection'),
+                        dl.Map(
+                            [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-end')],
+                            id='stt-trip-map-end',
+                            style={'height': '500px'},
+                            center=[32.7, -96.8],
+                            zoom=5,
+                        ),
+                    ]
+                ),
+            ]
+        ),
+        dbc.Row(
+            html.Div(id='message'),
+        ),
+    ]
+)
+
+
+def map_click(click_lat_lng, radius, circle_color):
+    # WARN: There seem to be a bug on click_lat_lng leaflet event, values can be out of bound (e.g: -357 for lat in eu), some kind of modulus might be required
+    # Couldn't reproduce it though
+    layer_children = [
+        dl.Circle(center=click_lat_lng, radius=radius, color=circle_color)
+    ]
+    endpoint_coords = click_lat_lng
+    zoom = 13
+    map_center = click_lat_lng
+    return layer_children, endpoint_coords, zoom, map_center
+
+
+@callback(
+    Output('stt-trip-map-end', 'zoom', allow_duplicate=True),
+    Output('stt-trip-map-end', 'center', allow_duplicate=True),
+    Input('link-trip-time-start', 'data'),
+    State('stt-trip-map-end', 'zoom'),
+    State('stt-trip-map-end', 'center'),
+    State('link-trip-time-end', 'data'),
+    prevent_initial_call=True,
+)
+# This is optional, it's used on first selection to help user locate his endpoint
+def center_end_map_helper(link_trip_time_start, end_zoom, end_center, link_trip_time_end):
+    if link_trip_time_end[0] == 0:
+        end_zoom = 13
+        end_center = link_trip_time_start
+    return end_zoom, end_center
+
+
+@callback(
+    Output('stt-trip-layer-start', 'children'),
+    Output('link-trip-time-start', 'data'),
+    Output('stt-trip-map-start', 'zoom'),
+    Output('stt-trip-map-start', 'center'),
+    Input('stt-trip-map-start', 'click_lat_lng'),
+    State('detection-radius', 'value'),
+    prevent_initial_call=True,
+)
+def map_start_click(click_lat_lng, radius):
+    return map_click(click_lat_lng, radius, "green")
+
+
+@callback(
+    Output('stt-trip-layer-end', 'children'),
+    Output('link-trip-time-end', 'data'),
+    Output('stt-trip-map-end', 'zoom'),
+    Output('stt-trip-map-end', 'center'),
+    Input('stt-trip-map-end', 'click_lat_lng'),
+    State('detection-radius', 'value'),
+    prevent_initial_call=True,
+)
+def map_end_click(click_lat_lng, radius):
+    return map_click(click_lat_lng, radius, "red")
+
+
+def format_duration_df(df, time_column_name='Time sample'):
+    df['Median time (minutes)'] = df.duration / 60  # convert seconds in minutes
+    df = df.reset_index().rename(
+        columns={
+            'start_fmt_time': time_column_name,
+            'duration': 'Median time (seconds)',
+            'section': 'Count',
+            'mode': 'Mode',
+        }
+    )
+    if time_column_name in df:
+        if 'Mode' in df:
+            df = df[
+                [
+                    'Mode',
+                    time_column_name,
+                    'Median time (seconds)',
+                    'Median time (minutes)',
+                    'Count',
+                ]
+            ]  # reorder cols
+        else:
+            df = df[
+                [
+                    time_column_name,
+                    'Median time (seconds)',
+                    'Median time (minutes)',
+                    'Count',
+                ]
+            ]  # reorder cols
+    else:
+        df = df[
+            ['Mode', 'Median time (seconds)', 'Median time (minutes)', 'Count']
+        ]  # reorder cols
+    df = df.to_dict('records')  # Format for display
+    return df
+
+
+@callback(
+    Output('message', 'children'),
+    Input('link-trip-time-start', 'data'),
+    Input('link-trip-time-end', 'data'),
+    State('detection-radius', 'value'),
+    prevent_initial_call=True,
+)
+def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_end, radius):
+    if link_trip_time_end[0] == 0 or link_trip_time_start[0] == 0:
+        return ''
+    # logging.debug("link_trip_time_start: " + str(link_trip_time_start))
+    # logging.debug("link_trip_time_end: " + str(link_trip_time_end))
+
+    # Warning: This is a database call, looks here if there is a performance hog.
+    # From initial tests, this seems to be performing well, without the need to do geoqueries in memory
+    df = db_utils.query_segments_crossing_endpoints(
+        link_trip_time_start[0],
+        link_trip_time_start[1],
+        link_trip_time_end[0],
+        link_trip_time_end[1],
+        radius,
+    )
+    total_nb_trips = df.shape[0]
+    if total_nb_trips > 0:
+        modes = [e.name for e in ecwm.PredictedModeTypes]
+        # Warning: Another db call here.
+        # In theory, we could load all inferred_section modes in memory at start time, instead of fetching it everytime
+        # However, when testing it, the operation is quite heavy on the db and on ram.
+        # I opted for querying only sections we're interested in, every time. Page load is still decent, especially when the number of section is low.
+        mode_by_section_id = db_utils.query_inferred_sections_modes(
+            df['section'].to_list()
+        )
+        df['mode'] = df['section'].apply(
+            lambda section_id: modes[mode_by_section_id[str(section_id)]]
+        )
+        median_trip_time = df['duration'].median()
+        times = pd.to_datetime(df['start_fmt_time'], errors='coerce', utc=True)
+        duration_per_hour = format_duration_df(
+            df.groupby(times.dt.hour).agg({'duration': 'median', 'section': 'count'}),
+            time_column_name='Hour',
+        )
+        duration_per_mode = format_duration_df(
+            df.groupby('mode').agg({'duration': 'median', 'section': 'count'})
+        )
+        duration_per_mode_per_hour = format_duration_df(
+            df.groupby(['mode', times.dt.hour]).agg(
+                {'duration': 'median', 'section': 'count'}
+            ),
+            time_column_name='Hour',
+        )
+        duration_per_mode_per_month = format_duration_df(
+            df.groupby(['mode', times.dt.month]).agg(
+                {'duration': 'median', 'section': 'count'}
+            ),
+            time_column_name='Month',
+        )
+        return dbc.Row(
+            [
+                dbc.Col(
+                    [
+                        html.Br(),
+                        html.H3('Results'),
+                        html.Div(
+                            f'Computed median segment duration is {median_trip_time} seconds, {total_nb_trips} trips considered'
+                        ),
+                        html.Br(),
+                        html.H4('Median segment duration by mode of transport'),
+                        dash_table.DataTable(
+                            id='duration_per_mode',
+                            data=duration_per_mode,
+                            sort_action='native',
+                            sort_mode='multi',
+                            export_format='csv',
+                        ),
+                        html.Br(),
+                        html.H4(
+                            'Median segment duration by mode and hour of the day (UTC)'
+                        ),
+                        dash_table.DataTable(
+                            id='duration_per_hour',
+                            data=duration_per_hour,
+                            sort_action='native',
+                            sort_mode='multi',
+                            export_format='csv',
+                        ),
+                        html.Br(),
+                        html.H4(
+                            'Median segment duration by mode and hour of the day (UTC)'
+                        ),
+                        dash_table.DataTable(
+                            id='duration_per_mode_per_hour',
+                            data=duration_per_mode_per_hour,
+                            sort_action='native',
+                            sort_mode='multi',
+                            export_format='csv',
+                        ),
+                        html.Br(),
+                        html.H4('Median segment duration by mode and month'),
+                        dash_table.DataTable(
+                            id='duration_per_mode_per_month',
+                            data=duration_per_mode_per_month,
+                            sort_action='native',
+                            sort_mode='multi',
+                            export_format='csv',
+                        ),
+                    ],
+                    xs=6,
+                ),
+                dbc.Col(
+                    [
+                        html.Br(),
+                        html.H3('Trips Data'),
+                        dash_table.DataTable(
+                            id='trips_data',
+                            data=df[
+                                ['start_fmt_time', 'end_fmt_time', 'mode', 'duration']
+                            ].to_dict('records'),
+                            page_size=15,
+                            sort_action='native',
+                            sort_mode='multi',
+                            export_format='csv',
+                        ),
+                    ],
+                    xs=6,
+                    style={
+                        'display': 'block'
+                        if has_permission('segment_trip_time_full_trips')
+                        else 'none'
+                    },
+                ),
+            ]
+        )
+    return [html.H3('Results'), dcc.Markdown(not_enough_data_message)]
+
+
+# This is left as an example on loading all inferred_section modes in memory at start time, instead of fetching it everytime
+# This lead to poor memory performances on a larger db
+# @callback(
+#     Output('store-mode-by-section-id', 'data'),
+#     Input('store-trips', 'data') # Only using this an initial trigger
+# )
+# def load_mode_by_section_id(s):
+#     return db_utils.query_inferred_sections_modes()
diff --git a/requirements.txt b/requirements.txt
index 6f1bce8..cd6b59d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,12 @@ python-jose==3.3.0
 flask==2.2.5
 flask-talisman==1.0.0
 dash_auth==2.0.0
+dash-leaflet==0.1.28
+
+# emission requirements
+arrow==1.2.3
+pymongo==4.4.1
+pandas==2.0.3
+future==0.18.3
+attrdict==2.0.1
+geojson==3.0.1
diff --git a/utils/db_utils.py b/utils/db_utils.py
index c0deb42..658d9b0 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -3,6 +3,7 @@
 from uuid import UUID
 
 import arrow
+from bson.son import SON
 
 import pandas as pd
 import pymongo
@@ -158,4 +159,87 @@ def add_user_stats(user_data):
             if last_call != -1:
                 user['last_call'] = arrow.get(last_call).format(time_format)
 
-    return user_data
\ No newline at end of file
+    return user_data
+
+def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, range_around_endpoints=400):
+    # data.loc only appears in analysis/recreated_location
+    query_start = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [start_long, start_lat])])), ('$maxDistance', range_around_endpoints)])}}
+    query_end = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [end_long, end_lat])])), ('$maxDistance', range_around_endpoints)])}}
+    
+    res_end = edb.get_analysis_timeseries_db().find(query_end)
+    end_by_section = {}
+    for elt in res_end:
+        elt_data = elt.get('data')
+        if elt_data:
+            section_id = elt_data.get('section')
+            # This makes sure that only the first encounter of the section is added
+            # The near query gives closest points first, so the first one is the one we want
+            if section_id not in end_by_section:
+                end_by_section[section_id] = elt
+    
+    res_start = edb.get_analysis_timeseries_db().find(query_start)
+    start_by_section = {}
+    for elt in res_start:
+        elt_data = elt.get('data')
+        if elt_data:
+            section_id = elt_data.get('section')
+            if section_id not in start_by_section:
+                start_by_section[section_id] = elt
+    
+    vals = []
+    user_id_seen_dict = {}
+    number_user_seen = 0
+    # Now we can read every section crossing start point
+    for section_id in start_by_section:
+        matching_start = start_by_section[section_id]
+        matching_start_data = matching_start.get('data')
+        if matching_start_data is None:
+            # Something is wrong with the fetched data, shouldn't happen
+            continue
+        if 'idx' not in matching_start_data:
+            # Sometimes, idx is missing in data, not sure why
+            continue
+        matching_end = end_by_section.get(section_id)
+        if matching_end is None:
+            # This section_id didn't cross the end point
+            continue
+        matching_end_data = matching_end.get('data', {})
+        # idx allows us to check that the start section is crossed first, we do not care about trips going the other way
+        if 'idx' in matching_end_data and matching_start_data.get('idx') < matching_end_data.get('idx'):
+            user_id = str(start_by_section[section_id].get('user_id'))
+            if user_id_seen_dict.get(user_id) is None:
+                number_user_seen += 1
+                user_id_seen_dict[user_id] = True
+            vals.append({
+                'start': start_by_section[section_id], 
+                'end': end_by_section[section_id], 
+                'duration': matching_end_data.get('ts') - matching_start_data.get('ts'),
+                'section': section_id, 
+                'mode': matching_start_data.get('mode'), # Note: this is the mode given by the phone, not the computed one, we'll read it later from inferred_sections
+                'start_fmt_time': matching_start_data.get('fmt_time'),
+                'end_fmt_time': matching_end_data.get('fmt_time')
+            })
+    if perm_utils.permissions.get("segment_trip_time_min_users", 0) <= number_user_seen:
+        return pd.DataFrame.from_dict(vals)
+    return pd.DataFrame.from_dict([])
+
+# The following query can be called multiple times, so we extract index creation from the function
+analysis_timeseries_db = edb.get_analysis_timeseries_db()
+# In theory, adding an index on the cleaned_section should improve query performance
+analysis_timeseries_db.create_index([('data.cleaned_section', pymongo.ASCENDING)])
+
+# When sections isn't set, this fetches all inferred_section
+# Otherwise, it filters on given section ids using '$in'
+# Note: for performance reasons, it is not recommended to use '$in' a list bigger than ~100 values
+# In our use case, this could happen on popular trips, but the delay is deemed acceptable
+def query_inferred_sections_modes(sections=[]):
+    query = {'metadata.key': 'analysis/inferred_section'}
+    if len(sections) > 0:
+        query['data.cleaned_section'] = {'$in': sections} 
+    res = analysis_timeseries_db.find(query, {'data.cleaned_section': 1, 'data.sensed_mode': 1})
+    mode_by_section_id = {}
+    for elt in res:
+        elt_data = elt.get('data')
+        if elt_data:
+            mode_by_section_id[str(elt_data.get('cleaned_section'))] = elt_data.get('sensed_mode') or 0
+    return mode_by_section_id

From c56af0928799896b873f8b2822ebd5a16dc1164e Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Fri, 11 Aug 2023 17:57:45 +0200
Subject: [PATCH 3/9] Fix typos

---
 README.md                  | 2 +-
 pages/segment_trip_time.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d383f4a..fbb9bee 100644
--- a/README.md
+++ b/README.md
@@ -135,7 +135,7 @@ specified in the following sections.
 
 ### Segment Trip Time Page
 - `segment_trip_time`: User can view this page. (default `true`)
-- `segment_trip_time_full_trips`: User can see the table containing non-agregated data (default `true`)
+- `segment_trip_time_full_trips`: User can see the table containing non-aggregated data (default `true`)
 - `segment_trip_time_min_users`: Minimal number of distinct users in data required to display anything (value is a number, default `0`).
 
 ### Push Notification Page
diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py
index 8b38566..1d28dab 100644
--- a/pages/segment_trip_time.py
+++ b/pages/segment_trip_time.py
@@ -257,7 +257,7 @@ def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_en
                         ),
                         html.Br(),
                         html.H4(
-                            'Median segment duration by mode and hour of the day (UTC)'
+                            'Median segment duration by hour of the day (UTC)'
                         ),
                         dash_table.DataTable(
                             id='duration_per_hour',

From 84f946d7bfb6ef2d928b6a5d70f524ef3b41e50a Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Wed, 23 Aug 2023 16:05:55 +0200
Subject: [PATCH 4/9] Cleanup requirements

---
 requirements.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cd6b59d..ffa57a4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,11 +15,3 @@ flask==2.2.5
 flask-talisman==1.0.0
 dash_auth==2.0.0
 dash-leaflet==0.1.28
-
-# emission requirements
-arrow==1.2.3
-pymongo==4.4.1
-pandas==2.0.3
-future==0.18.3
-attrdict==2.0.1
-geojson==3.0.1

From 88c9665ff0a6f0863372e239ed8f038eadcb5770 Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Wed, 23 Aug 2023 16:20:43 +0200
Subject: [PATCH 5/9] Remove db index creation (moved to emission core)

---
 utils/db_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/utils/db_utils.py b/utils/db_utils.py
index 658d9b0..88c2eef 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -223,10 +223,8 @@ def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long,
         return pd.DataFrame.from_dict(vals)
     return pd.DataFrame.from_dict([])
 
-# The following query can be called multiple times, so we extract index creation from the function
+# The following query can be called multiple times, let's open db only once
 analysis_timeseries_db = edb.get_analysis_timeseries_db()
-# In theory, adding an index on the cleaned_section should improve query performance
-analysis_timeseries_db.create_index([('data.cleaned_section', pymongo.ASCENDING)])
 
 # When sections isn't set, this fetches all inferred_section
 # Otherwise, it filters on given section ids using '$in'

From 25e51792d786541ef547f821b8cc38f15567213d Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Wed, 23 Aug 2023 16:49:00 +0200
Subject: [PATCH 6/9] Drop SON usage, makes for cleaner dicts

---
 utils/db_utils.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/utils/db_utils.py b/utils/db_utils.py
index 88c2eef..f3a676d 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -3,7 +3,6 @@
 from uuid import UUID
 
 import arrow
-from bson.son import SON
 
 import pandas as pd
 import pymongo
@@ -163,9 +162,28 @@ def add_user_stats(user_data):
 
 def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, range_around_endpoints=400):
     # data.loc only appears in analysis/recreated_location
-    query_start = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [start_long, start_lat])])), ('$maxDistance', range_around_endpoints)])}}
-    query_end = {'data.loc': {'$near': SON([('$geometry', SON([('type', 'Point'), ('coordinates', [end_long, end_lat])])), ('$maxDistance', range_around_endpoints)])}}
-    
+    query_start = {
+        'data.loc': {
+            '$near': {
+                '$geometry': {
+                    'type': 'Point',
+                    'coordinates': [start_long, start_lat]
+                }, 
+                '$maxDistance': range_around_endpoints
+            }
+        }
+    }
+    query_end = {
+        'data.loc': {
+            '$near': {
+                '$geometry': {
+                    'type': 'Point',
+                    'coordinates': [end_long, end_lat]
+                }, 
+                '$maxDistance': range_around_endpoints
+            }
+        }
+    }
     res_end = edb.get_analysis_timeseries_db().find(query_end)
     end_by_section = {}
     for elt in res_end:

From 59e0c9810e68981593afef224a345f5c38732227 Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Tue, 29 Aug 2023 15:04:13 +0200
Subject: [PATCH 7/9] Dropped point and range selection, users now draw
 polygons for start and end zones Rewrote data fetching to use the geoquery
 sdk

---
 pages/segment_trip_time.py | 140 +++++++++++++++----------------------
 requirements.txt           |   2 +-
 utils/db_utils.py          |  99 ++++++--------------------
 3 files changed, 79 insertions(+), 162 deletions(-)

diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py
index 1d28dab..9a28f46 100644
--- a/pages/segment_trip_time.py
+++ b/pages/segment_trip_time.py
@@ -5,6 +5,7 @@
 
 import emission.core.wrapper.modeprediction as ecwm
 import logging
+import json
 
 from utils.permissions import has_permission, permissions
 from utils import db_utils
@@ -13,69 +14,76 @@
 
 intro = """
 ## Segment average trip time
-This page displays some statistics on average trip duration between two selected points.
-"""
+This page displays some statistics on average trip duration between two selected zones.
 
-first_step = """
-### First, select a detection radius.
-This is the range in meters around start and end point where GPS data can be detected, closest data is prioritized.
+### Usage
+Using the polygon or square tools on the maps' menu, draw the start (left map) and end (right map) zones to consider.
 
-This should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate).
+Data will then be fetched for trips crossing the start zone and then the end zone.
 
-A bigger number means more trips will be considered, however it might find trips on the wrong road if roads are close enough.
+Here are some tips on how to draw zones:
+* Zones shouldn't cover more than one parallel road; otherwise, it is unclear which path the user took.
+* A bigger zone will give more results, at the cost of lower accuracy in trip durations (the start point could be anywhere in the zone).
+* For exhaustivity, zone length should somewhat match the distance a vehicle can cross at the maximum allowed speed in 30 seconds (sample rate).
+* A smaller zone will give more accurate time results, but the number of trips might be too low to be significant.
+* Zones can be moved and edited using the Edit layer menu, and they can be deleted with the Delete layer button.
+* Please be advised that only the last added zone will be considered on each map. It is thus advised to delete existing zones before creating new ones.
 """
 
-second_step = """
-### Then, select start and end points
-These can be anywhere on the map, but are usually on road intersections.
-
-A circle will be shown with the detection radius set on the previous step. For accurate results, the circle should not cover more than one parallel road.
-"""
 
 not_enough_data_message = f"""
 Not enough segments could be found between endpoints. This means that the number of recorded trips going from start to end point is too low. 
 * There could be data, but on an insufficient number of users, breaking anonymity (minimum number of users is currently set to {permissions.get('segment_trip_time_min_users', 0)})
-* You could try to increase the detection radius, or chose different start and end points.
+* You could try to increase the zone sizes, or chose different start and end points.
 """
 
+initial_maps_center = [32.7, -96.8]
+initial_maps_zoom = 5
 layout = html.Div(
     [
-        dcc.Store(id='link-trip-time-start', data=(0, 0)),
-        dcc.Store(id='link-trip-time-end', data=(0, 0)),
-        # dcc.Store(id='store-mode-by-section-id', data={}),
+        dcc.Store(id='link-trip-time-start', data=json.dumps({"features": []})),
+        dcc.Store(id='link-trip-time-end', data=json.dumps({"features": []})),
         dcc.Markdown(intro),
-        dcc.Markdown(first_step),
-        dcc.Slider(
-            0,
-            2500,
-            id='detection-radius',
-            value=200,
-            tooltip={"placement": "bottom", "always_visible": True},
-        ),
-        dcc.Markdown(second_step),
         dbc.Row(
             [
                 dbc.Col(
                     [
-                        html.H4('Start point selection'),
+                        html.H4('Start zone selection'),
                         dl.Map(
-                            [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')],
+                            [
+                                dl.TileLayer(), 
+                                dl.FeatureGroup([
+                                    dl.EditControl(
+                                        id="stt-edit-control-start", 
+                                        draw=dict(circle=False, marker=False, polyline=False, circlemarker=False)
+                                    )
+                                ])
+                            ],
+                            #[dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-start')],
                             id='stt-trip-map-start',
                             style={'height': '500px'},
-                            center=[32.7, -96.8],
-                            zoom=5,
+                            center=initial_maps_center,
+                            zoom=initial_maps_zoom
                         ),
                     ]
                 ),
                 dbc.Col(
                     [
-                        html.H4('End point selection'),
+                        html.H4('End zone selection'),
                         dl.Map(
-                            [dl.TileLayer(), dl.LayerGroup(id='stt-trip-layer-end')],
+                            [
+                                dl.TileLayer(), 
+                                dl.FeatureGroup([
+                                    dl.EditControl(
+                                        id="stt-edit-control-end", 
+                                        draw=dict(circle=False, marker=False, polyline=False, circlemarker=False)
+                                    )
+                                ])
+                            ],
                             id='stt-trip-map-end',
                             style={'height': '500px'},
-                            center=[32.7, -96.8],
-                            zoom=5,
+                            center=initial_maps_center,
+                            zoom=initial_maps_zoom
                         ),
                     ]
                 ),
@@ -88,59 +96,23 @@
 )
 
 
-def map_click(click_lat_lng, radius, circle_color):
-    # WARN: There seem to be a bug on click_lat_lng leaflet event, values can be out of bound (e.g: -357 for lat in eu), some kind of modulus might be required
-    # Couldn't reproduce it though
-    layer_children = [
-        dl.Circle(center=click_lat_lng, radius=radius, color=circle_color)
-    ]
-    endpoint_coords = click_lat_lng
-    zoom = 13
-    map_center = click_lat_lng
-    return layer_children, endpoint_coords, zoom, map_center
-
-
-@callback(
-    Output('stt-trip-map-end', 'zoom', allow_duplicate=True),
-    Output('stt-trip-map-end', 'center', allow_duplicate=True),
-    Input('link-trip-time-start', 'data'),
-    State('stt-trip-map-end', 'zoom'),
-    State('stt-trip-map-end', 'center'),
-    State('link-trip-time-end', 'data'),
-    prevent_initial_call=True,
-)
-# This is optional, it's used on first selection to help user locate his endpoint
-def center_end_map_helper(link_trip_time_start, end_zoom, end_center, link_trip_time_end):
-    if link_trip_time_end[0] == 0:
-        end_zoom = 13
-        end_center = link_trip_time_start
-    return end_zoom, end_center
-
 
 @callback(
-    Output('stt-trip-layer-start', 'children'),
     Output('link-trip-time-start', 'data'),
-    Output('stt-trip-map-start', 'zoom'),
-    Output('stt-trip-map-start', 'center'),
-    Input('stt-trip-map-start', 'click_lat_lng'),
-    State('detection-radius', 'value'),
+    Input('stt-edit-control-start', 'geojson'),
     prevent_initial_call=True,
 )
-def map_start_click(click_lat_lng, radius):
-    return map_click(click_lat_lng, radius, "green")
-
+def map_start_draw(geojson):
+    return json.dumps(geojson)
 
 @callback(
-    Output('stt-trip-layer-end', 'children'),
     Output('link-trip-time-end', 'data'),
-    Output('stt-trip-map-end', 'zoom'),
-    Output('stt-trip-map-end', 'center'),
-    Input('stt-trip-map-end', 'click_lat_lng'),
-    State('detection-radius', 'value'),
+    Input('stt-edit-control-end', 'geojson'),
     prevent_initial_call=True,
 )
-def map_end_click(click_lat_lng, radius):
-    return map_click(click_lat_lng, radius, "red")
+def map_end_draw(geojson):
+    return json.dumps(geojson)
+
 
 
 def format_duration_df(df, time_column_name='Time sample'):
@@ -185,11 +157,12 @@ def format_duration_df(df, time_column_name='Time sample'):
     Output('message', 'children'),
     Input('link-trip-time-start', 'data'),
     Input('link-trip-time-end', 'data'),
-    State('detection-radius', 'value'),
     prevent_initial_call=True,
 )
-def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_end, radius):
-    if link_trip_time_end[0] == 0 or link_trip_time_start[0] == 0:
+def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_time_end_str):
+    link_trip_time_start = json.loads(link_trip_time_start_str)
+    link_trip_time_end = json.loads(link_trip_time_end_str)
+    if len(link_trip_time_end["features"]) == 0 or len(link_trip_time_start["features"]) == 0:
         return ''
     # logging.debug("link_trip_time_start: " + str(link_trip_time_start))
     # logging.debug("link_trip_time_end: " + str(link_trip_time_end))
@@ -197,11 +170,8 @@ def generate_content_on_endpoints_change(link_trip_time_start, link_trip_time_en
     # Warning: This is a database call, looks here if there is a performance hog.
     # From initial tests, this seems to be performing well, without the need to do geoqueries in memory
     df = db_utils.query_segments_crossing_endpoints(
-        link_trip_time_start[0],
-        link_trip_time_start[1],
-        link_trip_time_end[0],
-        link_trip_time_end[1],
-        radius,
+        link_trip_time_start["features"][len(link_trip_time_start["features"])-1],
+        link_trip_time_end["features"][len(link_trip_time_end["features"])-1],
     )
     total_nb_trips = df.shape[0]
     if total_nb_trips > 0:
diff --git a/requirements.txt b/requirements.txt
index ffa57a4..61cf274 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,4 @@ python-jose==3.3.0
 flask==2.2.5
 flask-talisman==1.0.0
 dash_auth==2.0.0
-dash-leaflet==0.1.28
+dash-leaflet==1.0.7
diff --git a/utils/db_utils.py b/utils/db_utils.py
index f3a676d..14985a8 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -9,7 +9,9 @@
 
 import emission.core.get_database as edb
 import emission.storage.timeseries.abstract_timeseries as esta
+import emission.storage.timeseries.aggregate_timeseries as estag
 import emission.storage.timeseries.timequery as estt
+import emission.storage.timeseries.geoquery as estg
 
 from utils import constants
 from utils import permissions as perm_utils
@@ -160,85 +162,30 @@ def add_user_stats(user_data):
 
     return user_data
 
-def query_segments_crossing_endpoints(start_lat, start_long, end_lat, end_long, range_around_endpoints=400):
-    # data.loc only appears in analysis/recreated_location
-    query_start = {
-        'data.loc': {
-            '$near': {
-                '$geometry': {
-                    'type': 'Point',
-                    'coordinates': [start_long, start_lat]
-                }, 
-                '$maxDistance': range_around_endpoints
-            }
-        }
-    }
-    query_end = {
-        'data.loc': {
-            '$near': {
-                '$geometry': {
-                    'type': 'Point',
-                    'coordinates': [end_long, end_lat]
-                }, 
-                '$maxDistance': range_around_endpoints
-            }
-        }
-    }
-    res_end = edb.get_analysis_timeseries_db().find(query_end)
-    end_by_section = {}
-    for elt in res_end:
-        elt_data = elt.get('data')
-        if elt_data:
-            section_id = elt_data.get('section')
-            # This makes sure that only the first encounter of the section is added
-            # The near query gives closest points first, so the first one is the one we want
-            if section_id not in end_by_section:
-                end_by_section[section_id] = elt
+def query_segments_crossing_endpoints(poly_region_start, poly_region_end):
+    agg_ts = estag.AggregateTimeSeries().get_aggregate_time_series()
+
+    locs_matching_start = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_start))
+    locs_matching_start = locs_matching_start.drop_duplicates(subset=['section'])
+    if locs_matching_start.empty:
+        return locs_matching_start
     
-    res_start = edb.get_analysis_timeseries_db().find(query_start)
-    start_by_section = {}
-    for elt in res_start:
-        elt_data = elt.get('data')
-        if elt_data:
-            section_id = elt_data.get('section')
-            if section_id not in start_by_section:
-                start_by_section[section_id] = elt
+    locs_matching_end = agg_ts.get_data_df("analysis/recreated_location", geo_query = estg.GeoQuery(['data.loc'], poly_region_end))
+    locs_matching_end = locs_matching_end.drop_duplicates(subset=['section'])
+    if locs_matching_end.empty:
+        return locs_matching_end
     
-    vals = []
-    user_id_seen_dict = {}
-    number_user_seen = 0
-    # Now we can read every section crossing start point
-    for section_id in start_by_section:
-        matching_start = start_by_section[section_id]
-        matching_start_data = matching_start.get('data')
-        if matching_start_data is None:
-            # Something is wrong with the fetched data, shouldn't happen
-            continue
-        if 'idx' not in matching_start_data:
-            # Sometimes, idx is missing in data, not sure why
-            continue
-        matching_end = end_by_section.get(section_id)
-        if matching_end is None:
-            # This section_id didn't cross the end point
-            continue
-        matching_end_data = matching_end.get('data', {})
-        # idx allows us to check that the start section is crossed first, we do not care about trips going the other way
-        if 'idx' in matching_end_data and matching_start_data.get('idx') < matching_end_data.get('idx'):
-            user_id = str(start_by_section[section_id].get('user_id'))
-            if user_id_seen_dict.get(user_id) is None:
-                number_user_seen += 1
-                user_id_seen_dict[user_id] = True
-            vals.append({
-                'start': start_by_section[section_id], 
-                'end': end_by_section[section_id], 
-                'duration': matching_end_data.get('ts') - matching_start_data.get('ts'),
-                'section': section_id, 
-                'mode': matching_start_data.get('mode'), # Note: this is the mode given by the phone, not the computed one, we'll read it later from inferred_sections
-                'start_fmt_time': matching_start_data.get('fmt_time'),
-                'end_fmt_time': matching_end_data.get('fmt_time')
-            })
+    merged = locs_matching_start.merge(locs_matching_end, how='outer', on=['section'])
+    filtered = merged.loc[merged['idx_x']<merged['idx_y']].copy()
+    filtered['duration'] = filtered['ts_y'] - filtered['ts_x']
+    filtered['mode'] = filtered['mode_x']
+    filtered['start_fmt_time'] = filtered['fmt_time_x']
+    filtered['end_fmt_time'] = filtered['fmt_time_y']
+    
+    number_user_seen = filtered.user_id_x.nunique()
+
     if perm_utils.permissions.get("segment_trip_time_min_users", 0) <= number_user_seen:
-        return pd.DataFrame.from_dict(vals)
+        return filtered
     return pd.DataFrame.from_dict([])
 
 # The following query can be called multiple times, let's open db only once

From 6cdf8e6d82c3adf3a969d5d276a5fb6886857a76 Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Tue, 29 Aug 2023 16:32:24 +0200
Subject: [PATCH 8/9] Segment trip time: also use sdk to fetch sensed modes

---
 pages/segment_trip_time.py | 16 ++--------------
 utils/db_utils.py          | 25 ++++++++++++-------------
 2 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/pages/segment_trip_time.py b/pages/segment_trip_time.py
index 9a28f46..e9518bc 100644
--- a/pages/segment_trip_time.py
+++ b/pages/segment_trip_time.py
@@ -3,7 +3,6 @@
 import dash_leaflet as dl
 import pandas as pd
 
-import emission.core.wrapper.modeprediction as ecwm
 import logging
 import json
 
@@ -175,16 +174,15 @@ def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_tim
     )
     total_nb_trips = df.shape[0]
     if total_nb_trips > 0:
-        modes = [e.name for e in ecwm.PredictedModeTypes]
         # Warning: Another db call here.
         # In theory, we could load all inferred_section modes in memory at start time, instead of fetching it everytime
         # However, when testing it, the operation is quite heavy on the db and on ram.
         # I opted for querying only sections we're interested in, every time. Page load is still decent, especially when the number of section is low.
         mode_by_section_id = db_utils.query_inferred_sections_modes(
-            df['section'].to_list()
+            df[['section', 'user_id']].to_dict('records')
         )
         df['mode'] = df['section'].apply(
-            lambda section_id: modes[mode_by_section_id[str(section_id)]]
+            lambda section_id: mode_by_section_id[str(section_id)].name
         )
         median_trip_time = df['duration'].median()
         times = pd.to_datetime(df['start_fmt_time'], errors='coerce', utc=True)
@@ -284,13 +282,3 @@ def generate_content_on_endpoints_change(link_trip_time_start_str, link_trip_tim
             ]
         )
     return [html.H3('Results'), dcc.Markdown(not_enough_data_message)]
-
-
-# This is left as an example on loading all inferred_section modes in memory at start time, instead of fetching it everytime
-# This lead to poor memory performances on a larger db
-# @callback(
-#     Output('store-mode-by-section-id', 'data'),
-#     Input('store-trips', 'data') # Only using this an initial trigger
-# )
-# def load_mode_by_section_id(s):
-#     return db_utils.query_inferred_sections_modes()
diff --git a/utils/db_utils.py b/utils/db_utils.py
index 14985a8..923b0f3 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -12,6 +12,8 @@
 import emission.storage.timeseries.aggregate_timeseries as estag
 import emission.storage.timeseries.timequery as estt
 import emission.storage.timeseries.geoquery as estg
+import emission.storage.decorations.section_queries as esds
+import emission.core.wrapper.modeprediction as ecwm
 
 from utils import constants
 from utils import permissions as perm_utils
@@ -181,6 +183,7 @@ def query_segments_crossing_endpoints(poly_region_start, poly_region_end):
     filtered['mode'] = filtered['mode_x']
     filtered['start_fmt_time'] = filtered['fmt_time_x']
     filtered['end_fmt_time'] = filtered['fmt_time_y']
+    filtered['user_id'] = filtered['user_id_y']
     
     number_user_seen = filtered.user_id_x.nunique()
 
@@ -191,18 +194,14 @@ def query_segments_crossing_endpoints(poly_region_start, poly_region_end):
 # The following query can be called multiple times, let's open db only once
 analysis_timeseries_db = edb.get_analysis_timeseries_db()
 
-# When sections isn't set, this fetches all inferred_section
-# Otherwise, it filters on given section ids using '$in'
-# Note: for performance reasons, it is not recommended to use '$in' a list bigger than ~100 values
-# In our use case, this could happen on popular trips, but the delay is deemed acceptable
-def query_inferred_sections_modes(sections=[]):
-    query = {'metadata.key': 'analysis/inferred_section'}
-    if len(sections) > 0:
-        query['data.cleaned_section'] = {'$in': sections} 
-    res = analysis_timeseries_db.find(query, {'data.cleaned_section': 1, 'data.sensed_mode': 1})
+# Fetches sensed_mode for each section in a list
+# sections format example: [{'section': ObjectId('648d02b227fd2bb6635414a0'), 'user_id': UUID('6d7edf29-8b3f-451b-8d66-984cb8dd8906')}]
+def query_inferred_sections_modes(sections):
     mode_by_section_id = {}
-    for elt in res:
-        elt_data = elt.get('data')
-        if elt_data:
-            mode_by_section_id[str(elt_data.get('cleaned_section'))] = elt_data.get('sensed_mode') or 0
+    for section in sections:
+        matching_inferred_section = esds.cleaned2inferred_section(section.get('user_id'), section.get('section'))
+        if matching_inferred_section is None:
+            mode_by_section_id[str(section.get('section'))] = ecwm.PredictedModeTypes.UNKNOWN
+        else:
+            mode_by_section_id[str(section.get('section'))] = matching_inferred_section.data.sensed_mode # PredictedModeTypes
     return mode_by_section_id

From 1fea161d4b8ece20af099851ee66dda0794e43e4 Mon Sep 17 00:00:00 2001
From: TTalex <alex.bourreau@free.fr>
Date: Mon, 2 Oct 2023 11:11:51 +0200
Subject: [PATCH 9/9] Use new cleaned2inferred_section_list interface

---
 utils/db_utils.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/utils/db_utils.py b/utils/db_utils.py
index 923b0f3..96a359c 100644
--- a/utils/db_utils.py
+++ b/utils/db_utils.py
@@ -197,11 +197,5 @@ def query_segments_crossing_endpoints(poly_region_start, poly_region_end):
 # Fetches sensed_mode for each section in a list
 # sections format example: [{'section': ObjectId('648d02b227fd2bb6635414a0'), 'user_id': UUID('6d7edf29-8b3f-451b-8d66-984cb8dd8906')}]
 def query_inferred_sections_modes(sections):
-    mode_by_section_id = {}
-    for section in sections:
-        matching_inferred_section = esds.cleaned2inferred_section(section.get('user_id'), section.get('section'))
-        if matching_inferred_section is None:
-            mode_by_section_id[str(section.get('section'))] = ecwm.PredictedModeTypes.UNKNOWN
-        else:
-            mode_by_section_id[str(section.get('section'))] = matching_inferred_section.data.sensed_mode # PredictedModeTypes
-    return mode_by_section_id
+    return esds.cleaned2inferred_section_list(sections)
+