import pandas as pd
import numpy as np
# Make Jupyter notebook cells use the full browser width
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', None)
# ## Read and process Data
df = pd.read_csv("amazon.csv")
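# Optional guard (assumption: the text columns of this Kaggle Amazon dataset may contain
# missing values): drop rows without text so .lower() below does not fail on NaN values.
df = df.dropna(subset=['product_name', 'category', 'about_product'])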
def preprocess_text(text):
    # Lower-case the text so embeddings are built from case-normalized input
    return text.lower()
df['product_name'] = df['product_name'].apply(preprocess_text)
df['category'] = df['category'].apply(preprocess_text)
df['about_product'] = df['about_product'].apply(preprocess_text)
# Combine product name, category, and product description into a single text field
df['combined_text'] = df['product_name'] + ' ' + df['category'] + ' ' + df['about_product']
# Keep only the columns needed for the recommender
df = df[['product_name', 'category', 'about_product', 'combined_text']]
# ## Sentence Transformers
# Sentence Transformers embed sentences into a vector space, which is useful whenever sentences
# need to be compared, for example for text classification or semantic similarity.
# They can compute the similarity between two sentences,
# which is useful for information retrieval, where you need to find documents similar to a given query.
# They can perform semantic search,
# which is useful for question answering, where you must find documents containing answers to a given question.
# They can also cluster documents,
# which is useful for topic modeling or document classification, where you need to group documents by topic or category.
# A short similarity check follows the model load below.
#pip install sentence-transformers
from sentence_transformers import SentenceTransformer
# Load a pre-trained sentence transformer model (e.g., 'bert-base-nli-mean-tokens')
model = SentenceTransformer('bert-base-nli-mean-tokens')
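# Quick illustrative check (the example sentences are assumptions, not taken from the dataset):
# semantically related sentences should score higher than unrelated ones.
from sentence_transformers import util
demo_embeddings = model.encode(['usb cable 1 meter', 'usb charging cable', 'stainless steel water bottle'])
print(util.pytorch_cos_sim(demo_embeddings[0], demo_embeddings[1]))  # related pair: higher score
print(util.pytorch_cos_sim(demo_embeddings[0], demo_embeddings[2]))  # unrelated pair: lower score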
# ## Embeddings
# In natural language processing and machine learning, "embedding" refers to the process of converting words, sentences,
# or entire documents into numerical vectors or representations.
# Encode the combined text into embeddings (batched encoding is much faster than encoding row by row)
df['combined_embeddings'] = list(model.encode(df['combined_text'].tolist(), show_progress_bar=True))
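# Sanity check: each embedding should be a fixed-length float vector
# (768 dimensions for this BERT-base model).
print(df['combined_embeddings'].iloc[0].shape)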
# # Save the DataFrame with embeddings to a CSV file
# df.to_csv('data_with_embeddings.csv', index=False)
# df.to_csv('embeddings.csv', index=False, float_format='%.10f')
# # Load the DataFrame with embeddings from the CSV file
# df = pd.read_csv('data_with_embeddings.csv')
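# Note: round-tripping numpy arrays through CSV stores them as strings, so they cannot be read
# back directly as vectors. A minimal sketch of a safer round trip with numpy
# (the file name 'combined_embeddings.npy' is just an example):
np.save('combined_embeddings.npy', np.vstack(df['combined_embeddings'].values))
df['combined_embeddings'] = list(np.load('combined_embeddings.npy'))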
# ## Cosine Similarity
# Cosine similarity measures how similar two vectors are via the cosine of the angle between them:
# vectors pointing in the same direction score 1, orthogonal vectors score 0.
from sentence_transformers import util
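# Worked example (assumed toy vectors, not embeddings from the dataset):
# cos(a, b) = a.b / (||a|| * ||b||), which is what util.pytorch_cos_sim computes.
vec_a = np.array([1.0, 0.0, 1.0])
vec_b = np.array([1.0, 1.0, 0.0])
manual = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
print(manual, float(util.pytorch_cos_sim(vec_a, vec_b)))  # both should print 0.5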
# Function to get the most similar products for a free-text query
def get_similar_items(combined_input, df, top_n=5):
    # Apply the same preprocessing as the corpus, then encode the query
    combined_embedding = model.encode(preprocess_text(combined_input))
    # Stack the per-row embeddings into one matrix so all products are compared at once
    corpus_embeddings = np.vstack(df['combined_embeddings'].values)
    # Calculate cosine similarity between the query and every combined text
    similarities = util.pytorch_cos_sim(combined_embedding, corpus_embeddings)
    # Get the indices of the top N most similar items (highest similarity first)
    similar_indices = similarities.argsort(descending=True, dim=1)[0][:top_n].tolist()
    # Retrieve the similar items from the DataFrame
    return df.iloc[similar_indices][['product_name', 'category', 'about_product']]
# # Similar Product Search Based on an Input Query
# Get similar items based on combined name and category
product_info_to_search = "USB Cable 1 meter"
similar_items = get_similar_items(product_info_to_search, df)
# Print the formatted output
print(f"Product: {product_info_to_search}")
print("\nSimilar Products:")
for idx, row in similar_items.iterrows():
    print(f"Product : {row['product_name']}")