From 6ec2c3ef31c67a1c973b032aa1c552ba7e76a130 Mon Sep 17 00:00:00 2001 From: akgom <132290469+akgom@users.noreply.github.com> Date: Tue, 25 Jun 2024 11:47:27 -0400 Subject: [PATCH 1/6] Update mongodb_connector.py --- maps/mongodb_connector.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py index ecae917..fbc67d6 100644 --- a/maps/mongodb_connector.py +++ b/maps/mongodb_connector.py @@ -1,13 +1,12 @@ import pymongo as pm -from nomic import AtlasProject +from nomic import atlas from sentence_transformers import SentenceTransformer import numpy as np import pandas as pd from pathlib import Path -import nomic -# replace with your mongodb connect string / cert -client = pm.MongoClient('mongodb+srv://cluster0.l3jhqfs.mongodb.net/' +# Replace with your MongoDB connection string and certificate file path +client = pm.MongoClient('mongodb+srv://:@cluster0.l3jhqfs.mongodb.net/testdb' '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority', tls=True, tlsCertificateKeyFile='mongocert.pem') @@ -17,20 +16,28 @@ # Delete current content of collection collection.delete_many({}) -# Load embedding data into mongodb +# Load embedding data into MongoDB mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet') + +# Initialize SentenceTransformer model model = SentenceTransformer('all-MiniLM-L6-v2') -title_embeds = model.encode(mongo_so['title']) + +# Encode titles into embeddings +title_embeds = model.encode(mongo_so['title'].tolist()) + +# Assign embeddings to DataFrame mso_te = mongo_so.assign(title_embedding=list(title_embeds)) -data = list(r._asdict() for r in mso_te.itertuples()) +# Convert DataFrame to list of dictionaries for MongoDB insertion +data = mso_te.to_dict(orient='records') for d in data: del d['Index'] d['title_embedding'] = d['title_embedding'].tolist() -data[0] + +# Insert data into MongoDB collection collection.insert_many(data) -# Read a mongodb collection with embeddings in it and map it: +# Read MongoDB collection with embeddings and map it using AtlasProject project = AtlasProject( name='MongoDB Stack Overflow Questions', unique_id_field='mongo_id', @@ -39,20 +46,28 @@ modality='embedding', ) +# Retrieve all items from MongoDB collection all_items = list(collection.find()) + +# Extract embeddings into numpy array embs = np.array([d['title_embedding'] for d in all_items]) + +# Prepare items for AtlasProject by converting _id to mongo_id and removing embeddings for d in all_items: d['mongo_id'] = str(d['_id']) del d['title_embedding'] del d['_id'] +# Add embeddings to AtlasProject project.add_embeddings(all_items, embs) +# Rebuild maps and create index for topic modeling project.rebuild_maps() project.create_index( name='MongoDB Stack Overflow Questions', - topic_label_field='body', + topic_label_field='body', # Replace with appropriate field for topic modeling build_topic_model=True, ) +# Print information about the AtlasProject print(project) From 3a49a0c2136db1a2c7bc24a9bd3c6f3d25c336e5 Mon Sep 17 00:00:00 2001 From: akgom <132290469+akgom@users.noreply.github.com> Date: Wed, 26 Jun 2024 11:59:05 -0400 Subject: [PATCH 2/6] Update mongodb_connector.py --- maps/mongodb_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py index fbc67d6..1d9c9a7 100644 --- a/maps/mongodb_connector.py +++ b/maps/mongodb_connector.py @@ -1,5 +1,5 @@ import pymongo as pm -from nomic import atlas +from nomic import atlasdataset from sentence_transformers import SentenceTransformer import numpy as np import pandas as pd From 6ff56451ed4d720fc22dc8d91546fada10cd4dc5 Mon Sep 17 00:00:00 2001 From: akgom <132290469+akgom@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:13:38 -0400 Subject: [PATCH 3/6] Update mongodb_connector.py --- maps/mongodb_connector.py | 43 ++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py index 1d9c9a7..b0bb487 100644 --- a/maps/mongodb_connector.py +++ b/maps/mongodb_connector.py @@ -1,5 +1,5 @@ import pymongo as pm -from nomic import atlasdataset +from nomic import AtlasDataset from sentence_transformers import SentenceTransformer import numpy as np import pandas as pd @@ -37,13 +37,12 @@ # Insert data into MongoDB collection collection.insert_many(data) -# Read MongoDB collection with embeddings and map it using AtlasProject -project = AtlasProject( - name='MongoDB Stack Overflow Questions', - unique_id_field='mongo_id', - reset_project_if_exists=True, +# Read MongoDB collection with embeddings and map it using AtlasDataset +dataset = AtlasDataset( + "MongoDB_StackOverflow_Questions", + unique_id_field="mongo_id", + reset_dataset_if_exists=True, is_public=True, - modality='embedding', ) # Retrieve all items from MongoDB collection @@ -52,22 +51,24 @@ # Extract embeddings into numpy array embs = np.array([d['title_embedding'] for d in all_items]) -# Prepare items for AtlasProject by converting _id to mongo_id and removing embeddings +# Prepare items for AtlasDataset by converting _id to mongo_id and removing embeddings for d in all_items: d['mongo_id'] = str(d['_id']) del d['title_embedding'] del d['_id'] -# Add embeddings to AtlasProject -project.add_embeddings(all_items, embs) - -# Rebuild maps and create index for topic modeling -project.rebuild_maps() -project.create_index( - name='MongoDB Stack Overflow Questions', - topic_label_field='body', # Replace with appropriate field for topic modeling - build_topic_model=True, -) - -# Print information about the AtlasProject -print(project) +# Add embeddings to AtlasDataset +dataset.add_data(data=all_items, embeddings=embs) + +# Create index in the dataset +index_options = { + "indexed_field": "title", # Replace with appropriate field for indexing + "modality": "embedding", + "topic_model": True, + "duplicate_detection": True, + "embedding_model": "NomicEmbed", +} +dataset.create_index(name="MongoDB_StackOverflow_Questions", **index_options) + +# Print information about the AtlasDataset +print(dataset) From d58b9f63740bf4df2abe4f260425447e32b23072 Mon Sep 17 00:00:00 2001 From: akgom <132290469+akgom@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:28:14 -0400 Subject: [PATCH 4/6] Update mongodb_connector.py --- maps/mongodb_connector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py index b0bb487..8532455 100644 --- a/maps/mongodb_connector.py +++ b/maps/mongodb_connector.py @@ -41,7 +41,6 @@ dataset = AtlasDataset( "MongoDB_StackOverflow_Questions", unique_id_field="mongo_id", - reset_dataset_if_exists=True, is_public=True, ) @@ -68,7 +67,7 @@ "duplicate_detection": True, "embedding_model": "NomicEmbed", } -dataset.create_index(name="MongoDB_StackOverflow_Questions", **index_options) +dataset.create_index(**index_options) # Print information about the AtlasDataset print(dataset) From dea0dd53c9ecbe5b4d1fb89f0d2aa5c9d1f8f5b2 Mon Sep 17 00:00:00 2001 From: akgom <132290469+akgom@users.noreply.github.com> Date: Thu, 27 Jun 2024 11:44:56 -0400 Subject: [PATCH 5/6] Update mongodb_connector.py --- maps/mongodb_connector.py | 50 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py index 8532455..85753ad 100644 --- a/maps/mongodb_connector.py +++ b/maps/mongodb_connector.py @@ -1,46 +1,50 @@ import pymongo as pm -from nomic import AtlasDataset -from sentence_transformers import SentenceTransformer -import numpy as np import pandas as pd from pathlib import Path +from nomic import AtlasDataset, embed +import numpy as np # Replace with your MongoDB connection string and certificate file path -client = pm.MongoClient('mongodb+srv://:@cluster0.l3jhqfs.mongodb.net/testdb' - '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority', - tls=True, - tlsCertificateKeyFile='mongocert.pem') +client = pm.MongoClient( + 'mongodb+srv://:@cluster0.l3jhqfs.mongodb.net/testdb' + '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority', + tls=True, + tlsCertificateKeyFile='mongocert.pem' +) -collection = client.testdb.testcoll +# Access or create MongoDB collection +db = client.testdb +collection = db.mongo_so -# Delete current content of collection +# Clear existing content in collection collection.delete_many({}) -# Load embedding data into MongoDB +# Load data into DataFrame mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet') -# Initialize SentenceTransformer model -model = SentenceTransformer('all-MiniLM-L6-v2') +# Initialize Nomic text embedding model +output = embed.text( + texts=mongo_so['title'].tolist(), + model='nomic-embed-text-v1.5', + inference_mode='local', # Use local inference +) -# Encode titles into embeddings -title_embeds = model.encode(mongo_so['title'].tolist()) +# Extract embeddings +title_embeds = output['embeddings'] # Assign embeddings to DataFrame -mso_te = mongo_so.assign(title_embedding=list(title_embeds)) +mongo_so['title_embedding'] = title_embeds # Convert DataFrame to list of dictionaries for MongoDB insertion -data = mso_te.to_dict(orient='records') -for d in data: - del d['Index'] - d['title_embedding'] = d['title_embedding'].tolist() +data = mongo_so.to_dict(orient='records') # Insert data into MongoDB collection collection.insert_many(data) -# Read MongoDB collection with embeddings and map it using AtlasDataset +# Initialize AtlasDataset for mapped data dataset = AtlasDataset( "MongoDB_StackOverflow_Questions", - unique_id_field="mongo_id", + unique_id_field="mongo_id", # Replace with appropriate unique identifier field is_public=True, ) @@ -56,7 +60,7 @@ del d['title_embedding'] del d['_id'] -# Add embeddings to AtlasDataset +# Add data and embeddings to AtlasDataset dataset.add_data(data=all_items, embeddings=embs) # Create index in the dataset @@ -65,7 +69,7 @@ "modality": "embedding", "topic_model": True, "duplicate_detection": True, - "embedding_model": "NomicEmbed", + "embedding_model": "nomic-embed-text-v1.5", # Specify Nomic embedding model } dataset.create_index(**index_options) From 45731213283f7399a91bc73d10b90b29165cd720 Mon Sep 17 00:00:00 2001 From: akgom <132290469+akgom@users.noreply.github.com> Date: Mon, 8 Jul 2024 11:28:02 -0400 Subject: [PATCH 6/6] Update mongodb_connector.py --- maps/mongodb_connector.py | 105 +++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 48 deletions(-) diff --git a/maps/mongodb_connector.py b/maps/mongodb_connector.py index 85753ad..4278b66 100644 --- a/maps/mongodb_connector.py +++ b/maps/mongodb_connector.py @@ -1,39 +1,34 @@ import pymongo as pm +import nomic +from nomic import AtlasDataset +from sentence_transformers import SentenceTransformer +from pymongo.mongo_client import MongoClient +import numpy as np import pandas as pd from pathlib import Path -from nomic import AtlasDataset, embed -import numpy as np -# Replace with your MongoDB connection string and certificate file path -client = pm.MongoClient( - 'mongodb+srv://:@cluster0.l3jhqfs.mongodb.net/testdb' - '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority', - tls=True, - tlsCertificateKeyFile='mongocert.pem' -) +# MongoDB connection string +client = MongoClient('mongodb+srv://:@.1fy6rp1.mongodb.net/?appName=', + tls=True) + +# Replace with your actual API key +nomic.login('YOUR_KEY_HERE') -# Access or create MongoDB collection -db = client.testdb -collection = db.mongo_so +# MongoDB collection +collection = client.sample_mflix.comments -# Clear existing content in collection +# Delete current content of collection collection.delete_many({}) -# Load data into DataFrame +# Load embedding data into MongoDB from parquet file mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet') -# Initialize Nomic text embedding model -output = embed.text( - texts=mongo_so['title'].tolist(), - model='nomic-embed-text-v1.5', - inference_mode='local', # Use local inference -) - -# Extract embeddings -title_embeds = output['embeddings'] +# Initialize SentenceTransformer model +model = SentenceTransformer('all-MiniLM-L6-v2') -# Assign embeddings to DataFrame -mongo_so['title_embedding'] = title_embeds +# Encode titles using SentenceTransformer +title_embeds = model.encode(mongo_so['title'].tolist()) +mongo_so['title_embedding'] = list(title_embeds) # Convert DataFrame to list of dictionaries for MongoDB insertion data = mongo_so.to_dict(orient='records') @@ -41,37 +36,51 @@ # Insert data into MongoDB collection collection.insert_many(data) -# Initialize AtlasDataset for mapped data -dataset = AtlasDataset( - "MongoDB_StackOverflow_Questions", - unique_id_field="mongo_id", # Replace with appropriate unique identifier field - is_public=True, -) - -# Retrieve all items from MongoDB collection +# Fetch all items from MongoDB collection all_items = list(collection.find()) # Extract embeddings into numpy array embs = np.array([d['title_embedding'] for d in all_items]) -# Prepare items for AtlasDataset by converting _id to mongo_id and removing embeddings +# Remove 'title_embedding' field from each item, and convert '_id' to string for d in all_items: - d['mongo_id'] = str(d['_id']) + d['_id'] = str(d['_id']) del d['title_embedding'] - del d['_id'] -# Add data and embeddings to AtlasDataset +# Create an AtlasDataset instance +dataset = AtlasDataset( + identifier='sample-mflix-comments', # Unique identifier for your dataset + description='MongoDB Movie Comments', + unique_id_field='_id', + is_public=True +) + +# Add data and embeddings to the AtlasDataset dataset.add_data(data=all_items, embeddings=embs) -# Create index in the dataset -index_options = { - "indexed_field": "title", # Replace with appropriate field for indexing - "modality": "embedding", - "topic_model": True, - "duplicate_detection": True, - "embedding_model": "nomic-embed-text-v1.5", # Specify Nomic embedding model -} -dataset.create_index(**index_options) - -# Print information about the AtlasDataset +# Create an index and map +dataset.create_index( + name='MongoDB Movie Comments', + indexed_field='body', # Replace with your topic label field + modality='embedding', + topic_model={ + 'build_topic_model': True, + 'topic_label_field': 'body' # Replace with the field used for topic labeling + }, + duplicate_detection={ + 'tag_duplicates': True, + 'duplicate_cutoff': 0.95 # Adjust as needed + }, + projection={ + 'n_neighbors': 15, # Example value, adjust as needed + 'n_epochs': 100, # Example value, adjust as needed + 'model': 'nomic-project-v2', + 'local_neighborhood_size': 30, + 'spread': 1.0, + 'rho': 0.5 + }, + embedding_model='NomicEmbed' # Specify the embedding model if needed +) + +# Print the dataset to confirm print(dataset)