Product Recommendation System with Large Language Models | Sentence Transformers | Amazon dataset

Posted by
import pandas as pd
import numpy as np

# Jupyter only: make notebook cells use the full browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Show the complete contents of long text columns instead of truncating them.
pd.set_option('display.max_colwidth', None)


# ## Read and process Data

# Load the product catalogue. The steps below read the 'product_name',
# 'category' and 'about_product' columns from it.
df = pd.read_csv("amazon.csv")

def preprocess_text(text):
    """Return a lower-cased copy of *text*.

    Missing values (``None`` or a float NaN, which ``pandas.read_csv``
    produces for empty cells) become an empty string instead of raising
    ``AttributeError``. Any other non-string value is converted with ``str``
    before lower-casing.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()


# Lower-case every text column that feeds the combined text.
text_columns = ['product_name', 'category', 'about_product']
for column in text_columns:
    df[column] = df[column].apply(preprocess_text)

# Combine product name, category and description into one string per product
df['combined_text'] = df['product_name'] + ' ' + df['category'] + ' ' + df['about_product']


# Keep only the columns used below. The explicit copy makes df an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[text_columns + ['combined_text']].copy()



# ## Sentence Transformers

# Sentence Transformer can be used to embed sentences into a vector space.
# This is useful for tasks such as text classification or semantic similarity where sentences need to be compared.

# Sentence Transformers can be used to compute the similarity between two sentences.
# This is useful for tasks such as information retrieval, where you need to find documents similar to a given query.

# Sentence Transformers can be used to perform semantic searches.
# This is useful for tasks such as question answering, where you must find documents containing answers to a given question.

# Sentence Transformers can be used to cluster documents.
# This is useful for tasks such as topic modeling or document classification, where you need to group documents by topic or category.


#pip install sentence-transformers

from sentence_transformers import SentenceTransformer
# Load the pre-trained 'bert-base-nli-mean-tokens' sentence transformer by name
model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')


# ## Embeddings

# In natural language processing and machine learning, "embedding" refers to the process of converting words, sentences,
# or entire documents into numerical vectors or representations. 


# Encode the combined text into embeddings.
# All texts go to encode() in one call so the model can process them in
# batches instead of running a separate forward pass for every row.
corpus_embeddings = model.encode(df['combined_text'].tolist())
# Store one 1-D vector per row (an object column of numpy arrays).
df['combined_embeddings'] = list(corpus_embeddings)




# # # Save the DataFrame with embeddings to a CSV file
# df.to_csv('data_with_embeddings.csv', index=False)
# # #df.to_csv('embeddings.csv', index=False, float_format='%.10f') 

# # # Load the DataFrame with embeddings from the CSV file
# df = pd.read_csv('data_with_embeddings.csv')



# ## Cosine similarity is a metric used to measure the similarity of two vectors.

from sentence_transformers import util

# Function to get similar items based on combined name and category
def get_similar_items(combined_input, df, top_n=5):
    """Return the rows of *df* most similar to a free-text query.

    Args:
        combined_input: Query text; it is encoded with the module-level
            ``model``.
        df: DataFrame with a ``combined_embeddings`` column holding one
            embedding vector per row, plus ``product_name``, ``category`` and
            ``about_product`` columns.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with the ``product_name``, ``category`` and
        ``about_product`` columns of the best matches, most similar first.
        Empty when *df* has no rows.
    """
    result_columns = ['product_name', 'category', 'about_product']
    if df.empty:
        # Nothing to rank; also avoids encoding the query for no reason.
        return df.iloc[0:0][result_columns]

    query_embedding = model.encode(combined_input)

    # The column holds one vector per row; stack them into a single
    # (rows, dim) matrix so the similarity helper receives a regular 2-D
    # array rather than a pandas Series of array objects.
    corpus_embeddings = np.stack(df['combined_embeddings'].to_numpy())

    # Cosine similarity between the query and every row, shape (1, rows).
    similarities = util.pytorch_cos_sim(query_embedding, corpus_embeddings)

    # Positions of the top_n highest scores, converted to plain ints for iloc.
    top_positions = similarities.argsort(descending=True, dim=1)[0][:top_n].tolist()
    return df.iloc[top_positions][result_columns]
    


# # Making Similar Product Search based on Input


# Query to search for. The catalogue text was lower-cased with
# preprocess_text, so the query gets the same treatment before encoding.
product_info_to_search = "  Usb Cable  1 meter"

similar_items = get_similar_items(preprocess_text(product_info_to_search), df)


# Print the query followed by the names of the matching products
print(f"Product: {product_info_to_search}")
print("\nSimilar Products:")
for product_name in similar_items['product_name']:
    print(f"Product : {product_name}")





Leave a Reply

Your email address will not be published. Required fields are marked *