From 6ec2c3ef31c67a1c973b032aa1c552ba7e76a130 Mon Sep 17 00:00:00 2001
From: akgom <132290469+akgom@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:47:27 -0400
Subject: [PATCH 1/6] Update mongodb_connector.py

---
 maps/mongodb_connector.py | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py
index ecae917..fbc67d6 100644
--- a/maps/mongodb_connector.py
+++ b/maps/mongodb_connector.py
@@ -1,13 +1,12 @@
 import pymongo as pm
-from nomic import AtlasProject
+from nomic import atlas
 from sentence_transformers import SentenceTransformer
 import numpy as np
 import pandas as pd
 from pathlib import Path
-import nomic
 
-# replace with your mongodb connect string / cert
-client = pm.MongoClient('mongodb+srv://cluster0.l3jhqfs.mongodb.net/'
+# Replace with your MongoDB connection string and certificate file path
+client = pm.MongoClient('mongodb+srv://<username>:<password>@cluster0.l3jhqfs.mongodb.net/testdb'
                         '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority',
                         tls=True,
                         tlsCertificateKeyFile='mongocert.pem')
@@ -17,20 +16,28 @@
 # Delete current content of collection
 collection.delete_many({})
 
-# Load embedding data into mongodb
+# Load embedding data into MongoDB
 mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet')
+
+# Initialize SentenceTransformer model
 model = SentenceTransformer('all-MiniLM-L6-v2')
-title_embeds = model.encode(mongo_so['title'])
+
+# Encode titles into embeddings
+title_embeds = model.encode(mongo_so['title'].tolist())
+
+# Assign embeddings to DataFrame
 mso_te = mongo_so.assign(title_embedding=list(title_embeds))
 
-data = list(r._asdict() for r in mso_te.itertuples())
+# Convert DataFrame to list of dictionaries for MongoDB insertion
+data = mso_te.to_dict(orient='records')
 for d in data:
     del d['Index']
     d['title_embedding'] = d['title_embedding'].tolist()
-data[0]
+
+# Insert data into MongoDB collection
 collection.insert_many(data)
 
-# Read a mongodb collection with embeddings in it and map it:
+# Read MongoDB collection with embeddings and map it using AtlasProject
 project = AtlasProject(
     name='MongoDB Stack Overflow Questions',
     unique_id_field='mongo_id',
@@ -39,20 +46,28 @@
     modality='embedding',
 )
 
+# Retrieve all items from MongoDB collection
 all_items = list(collection.find())
+
+# Extract embeddings into numpy array
 embs = np.array([d['title_embedding'] for d in all_items])
+
+# Prepare items for AtlasProject by converting _id to mongo_id and removing embeddings
 for d in all_items:
     d['mongo_id'] = str(d['_id'])
     del d['title_embedding']
     del d['_id']
 
+# Add embeddings to AtlasProject
 project.add_embeddings(all_items, embs)
 
+# Rebuild maps and create index for topic modeling
 project.rebuild_maps()
 project.create_index(
     name='MongoDB Stack Overflow Questions',
-    topic_label_field='body',
+    topic_label_field='body',  # Replace with appropriate field for topic modeling
     build_topic_model=True,
 )
 
+# Print information about the AtlasProject
 print(project)

From 3a49a0c2136db1a2c7bc24a9bd3c6f3d25c336e5 Mon Sep 17 00:00:00 2001
From: akgom <132290469+akgom@users.noreply.github.com>
Date: Wed, 26 Jun 2024 11:59:05 -0400
Subject: [PATCH 2/6] Update mongodb_connector.py

---
 maps/mongodb_connector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py
index fbc67d6..1d9c9a7 100644
--- a/maps/mongodb_connector.py
+++ b/maps/mongodb_connector.py
@@ -1,5 +1,5 @@
 import pymongo as pm
-from nomic import atlas
+from nomic import atlasdataset
 from sentence_transformers import SentenceTransformer
 import numpy as np
 import pandas as pd

From 6ff56451ed4d720fc22dc8d91546fada10cd4dc5 Mon Sep 17 00:00:00 2001
From: akgom <132290469+akgom@users.noreply.github.com>
Date: Wed, 26 Jun 2024 15:13:38 -0400
Subject: [PATCH 3/6] Update mongodb_connector.py

---
 maps/mongodb_connector.py | 43 ++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py
index 1d9c9a7..b0bb487 100644
--- a/maps/mongodb_connector.py
+++ b/maps/mongodb_connector.py
@@ -1,5 +1,5 @@
 import pymongo as pm
-from nomic import atlasdataset
+from nomic import AtlasDataset
 from sentence_transformers import SentenceTransformer
 import numpy as np
 import pandas as pd
@@ -37,13 +37,12 @@
 # Insert data into MongoDB collection
 collection.insert_many(data)
 
-# Read MongoDB collection with embeddings and map it using AtlasProject
-project = AtlasProject(
-    name='MongoDB Stack Overflow Questions',
-    unique_id_field='mongo_id',
-    reset_project_if_exists=True,
+# Read MongoDB collection with embeddings and map it using AtlasDataset
+dataset = AtlasDataset(
+    "MongoDB_StackOverflow_Questions",
+    unique_id_field="mongo_id",
+    reset_dataset_if_exists=True,
     is_public=True,
-    modality='embedding',
 )
 
 # Retrieve all items from MongoDB collection
@@ -52,22 +51,24 @@
 # Extract embeddings into numpy array
 embs = np.array([d['title_embedding'] for d in all_items])
 
-# Prepare items for AtlasProject by converting _id to mongo_id and removing embeddings
+# Prepare items for AtlasDataset by converting _id to mongo_id and removing embeddings
 for d in all_items:
     d['mongo_id'] = str(d['_id'])
     del d['title_embedding']
     del d['_id']
 
-# Add embeddings to AtlasProject
-project.add_embeddings(all_items, embs)
-
-# Rebuild maps and create index for topic modeling
-project.rebuild_maps()
-project.create_index(
-    name='MongoDB Stack Overflow Questions',
-    topic_label_field='body',  # Replace with appropriate field for topic modeling
-    build_topic_model=True,
-)
-
-# Print information about the AtlasProject
-print(project)
+# Add embeddings to AtlasDataset
+dataset.add_data(data=all_items, embeddings=embs)
+
+# Create index in the dataset
+index_options = {
+    "indexed_field": "title",  # Replace with appropriate field for indexing
+    "modality": "embedding",
+    "topic_model": True,
+    "duplicate_detection": True,
+    "embedding_model": "NomicEmbed",
+}
+dataset.create_index(name="MongoDB_StackOverflow_Questions", **index_options)
+
+# Print information about the AtlasDataset
+print(dataset)

From d58b9f63740bf4df2abe4f260425447e32b23072 Mon Sep 17 00:00:00 2001
From: akgom <132290469+akgom@users.noreply.github.com>
Date: Thu, 27 Jun 2024 09:28:14 -0400
Subject: [PATCH 4/6] Update mongodb_connector.py

---
 maps/mongodb_connector.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py
index b0bb487..8532455 100644
--- a/maps/mongodb_connector.py
+++ b/maps/mongodb_connector.py
@@ -41,7 +41,6 @@
 dataset = AtlasDataset(
     "MongoDB_StackOverflow_Questions",
     unique_id_field="mongo_id",
-    reset_dataset_if_exists=True,
     is_public=True,
 )
 
@@ -68,7 +67,7 @@
     "duplicate_detection": True,
     "embedding_model": "NomicEmbed",
 }
-dataset.create_index(name="MongoDB_StackOverflow_Questions", **index_options)
+dataset.create_index(**index_options)
 
 # Print information about the AtlasDataset
 print(dataset)

From dea0dd53c9ecbe5b4d1fb89f0d2aa5c9d1f8f5b2 Mon Sep 17 00:00:00 2001
From: akgom <132290469+akgom@users.noreply.github.com>
Date: Thu, 27 Jun 2024 11:44:56 -0400
Subject: [PATCH 5/6] Update mongodb_connector.py

---
 maps/mongodb_connector.py | 50 +++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py
index 8532455..85753ad 100644
--- a/maps/mongodb_connector.py
+++ b/maps/mongodb_connector.py
@@ -1,46 +1,50 @@
 import pymongo as pm
-from nomic import AtlasDataset
-from sentence_transformers import SentenceTransformer
-import numpy as np
 import pandas as pd
 from pathlib import Path
+from nomic import AtlasDataset, embed
+import numpy as np
 
 # Replace with your MongoDB connection string and certificate file path
-client = pm.MongoClient('mongodb+srv://<username>:<password>@cluster0.l3jhqfs.mongodb.net/testdb'
-                        '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority',
-                        tls=True,
-                        tlsCertificateKeyFile='mongocert.pem')
+client = pm.MongoClient(
+    'mongodb+srv://<username>:<password>@cluster0.l3jhqfs.mongodb.net/testdb'
+    '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority',
+    tls=True,
+    tlsCertificateKeyFile='mongocert.pem'
+)
 
-collection = client.testdb.testcoll
+# Access or create MongoDB collection
+db = client.testdb
+collection = db.mongo_so
 
-# Delete current content of collection
+# Clear existing content in collection
 collection.delete_many({})
 
-# Load embedding data into MongoDB
+# Load data into DataFrame
 mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet')
 
-# Initialize SentenceTransformer model
-model = SentenceTransformer('all-MiniLM-L6-v2')
+# Initialize Nomic text embedding model
+output = embed.text(
+    texts=mongo_so['title'].tolist(),
+    model='nomic-embed-text-v1.5',
+    inference_mode='local',  # Use local inference
+)
 
-# Encode titles into embeddings
-title_embeds = model.encode(mongo_so['title'].tolist())
+# Extract embeddings
+title_embeds = output['embeddings']
 
 # Assign embeddings to DataFrame
-mso_te = mongo_so.assign(title_embedding=list(title_embeds))
+mongo_so['title_embedding'] = title_embeds
 
 # Convert DataFrame to list of dictionaries for MongoDB insertion
-data = mso_te.to_dict(orient='records')
-for d in data:
-    del d['Index']
-    d['title_embedding'] = d['title_embedding'].tolist()
+data = mongo_so.to_dict(orient='records')
 
 # Insert data into MongoDB collection
 collection.insert_many(data)
 
-# Read MongoDB collection with embeddings and map it using AtlasDataset
+# Initialize AtlasDataset for mapped data
 dataset = AtlasDataset(
     "MongoDB_StackOverflow_Questions",
-    unique_id_field="mongo_id",
+    unique_id_field="mongo_id",  # Replace with appropriate unique identifier field
     is_public=True,
 )
 
@@ -56,7 +60,7 @@
     del d['title_embedding']
     del d['_id']
 
-# Add embeddings to AtlasDataset
+# Add data and embeddings to AtlasDataset
 dataset.add_data(data=all_items, embeddings=embs)
 
 # Create index in the dataset
@@ -65,7 +69,7 @@
     "modality": "embedding",
     "topic_model": True,
     "duplicate_detection": True,
-    "embedding_model": "NomicEmbed",
+    "embedding_model": "nomic-embed-text-v1.5",  # Specify Nomic embedding model
 }
 dataset.create_index(**index_options)
 

From 45731213283f7399a91bc73d10b90b29165cd720 Mon Sep 17 00:00:00 2001
From: akgom <132290469+akgom@users.noreply.github.com>
Date: Mon, 8 Jul 2024 11:28:02 -0400
Subject: [PATCH 6/6] Update mongodb_connector.py

---
 maps/mongodb_connector.py | 105 +++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 48 deletions(-)

diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py
index 85753ad..4278b66 100644
--- a/maps/mongodb_connector.py
+++ b/maps/mongodb_connector.py
@@ -1,39 +1,34 @@
 import pymongo as pm
+import nomic
+from nomic import AtlasDataset
+from sentence_transformers import SentenceTransformer
+from pymongo.mongo_client import MongoClient
+import numpy as np
 import pandas as pd
 from pathlib import Path
-from nomic import AtlasDataset, embed
-import numpy as np
 
-# Replace with your MongoDB connection string and certificate file path
-client = pm.MongoClient(
-    'mongodb+srv://<username>:<password>@cluster0.l3jhqfs.mongodb.net/testdb'
-    '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority',
-    tls=True,
-    tlsCertificateKeyFile='mongocert.pem'
-)
+# MongoDB connection string: replace <USERNAME>, <PASSWORD> and <APPNAME> with your own values
+client = MongoClient('mongodb+srv://<USERNAME>:<PASSWORD>@<APPNAME>.1fy6rp1.mongodb.net/?appName=<APPNAME>',
+                     tls=True)
+
+# Replace with your actual Nomic API key
+nomic.login('YOUR_KEY_HERE')
 
-# Access or create MongoDB collection
-db = client.testdb
-collection = db.mongo_so
+# MongoDB collection
+collection = client.sample_mflix.comments
 
-# Clear existing content in collection
+# Delete current content of collection (WARNING: the empty filter removes every document in it)
 collection.delete_many({})
 
-# Load data into DataFrame
+# Load the source data from the parquet file into a DataFrame
 mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet')
 
-# Initialize Nomic text embedding model
-output = embed.text(
-    texts=mongo_so['title'].tolist(),
-    model='nomic-embed-text-v1.5',
-    inference_mode='local',  # Use local inference
-)
-
-# Extract embeddings
-title_embeds = output['embeddings']
+# Initialize SentenceTransformer model
+model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Assign embeddings to DataFrame
-mongo_so['title_embedding'] = title_embeds
+# Encode titles using SentenceTransformer
+title_embeds = model.encode(mongo_so['title'].tolist())
+mongo_so['title_embedding'] = list(title_embeds)
 
 # Convert DataFrame to list of dictionaries for MongoDB insertion
 data = mongo_so.to_dict(orient='records')
@@ -41,37 +36,51 @@
 # Insert data into MongoDB collection
 collection.insert_many(data)
 
-# Initialize AtlasDataset for mapped data
-dataset = AtlasDataset(
-    "MongoDB_StackOverflow_Questions",
-    unique_id_field="mongo_id",  # Replace with appropriate unique identifier field
-    is_public=True,
-)
-
-# Retrieve all items from MongoDB collection
+# Fetch all items from MongoDB collection
 all_items = list(collection.find())
 
 # Extract embeddings into numpy array
 embs = np.array([d['title_embedding'] for d in all_items])
 
-# Prepare items for AtlasDataset by converting _id to mongo_id and removing embeddings
+# Remove 'title_embedding' field from each item, and convert '_id' to string
 for d in all_items:
-    d['mongo_id'] = str(d['_id'])
+    d['_id'] = str(d['_id'])
     del d['title_embedding']
-    del d['_id']
 
-# Add data and embeddings to AtlasDataset
+# Create an AtlasDataset instance
+dataset = AtlasDataset(
+    identifier='sample-mflix-comments',  # Unique identifier for your dataset
+    description='MongoDB Movie Comments',
+    unique_id_field='_id',
+    is_public=True
+)
+
+# Add data and embeddings to the AtlasDataset
 dataset.add_data(data=all_items, embeddings=embs)
 
-# Create index in the dataset
-index_options = {
-    "indexed_field": "title",  # Replace with appropriate field for indexing
-    "modality": "embedding",
-    "topic_model": True,
-    "duplicate_detection": True,
-    "embedding_model": "nomic-embed-text-v1.5",  # Specify Nomic embedding model
-}
-dataset.create_index(**index_options)
-
-# Print information about the AtlasDataset
+# Create an index and map
+dataset.create_index(
+    name='MongoDB Movie Comments',
+    indexed_field='body',  # Replace with the field you want indexed
+    modality='embedding',
+    topic_model={
+        'build_topic_model': True,
+        'topic_label_field': 'body'  # Replace with the field used for topic labeling
+    },
+    duplicate_detection={
+        'tag_duplicates': True,
+        'duplicate_cutoff': 0.95  # Adjust as needed
+    },
+    projection={
+        'n_neighbors': 15,  # Example value, adjust as needed
+        'n_epochs': 100,  # Example value, adjust as needed
+        'model': 'nomic-project-v2',
+        'local_neighborhood_size': 30,
+        'spread': 1.0,
+        'rho': 0.5
+    },
+    embedding_model='NomicEmbed'  # Specify the embedding model if needed
+)
+
+# Print the dataset to confirm
 print(dataset)