# gte large model
# Wed Feb 26 2025 10:50:56 GMT+0000 (Coordinated Universal Time)
# Saved by @piyushkumar121 #python
import logging
from typing import Dict, Iterator, List, Optional, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
# Request timestamped, INFO-and-above log records via the root logger
# configuration, then create this module's own named logger.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class TextEmbedder:
    """Score brand/influencer pairs by cosine similarity of text embeddings.

    Selected text columns of a brand row and of an influencer row are joined
    into one labelled string each, embedded with a sentence-transformers
    model, and compared with ``cos_sim``.
    """

    # Model loaded when the caller does not ask for another one.
    DEFAULT_MODEL_NAME = 'thenlper/gte-large'

    # Columns read from a brand row, in the order they appear in the text.
    BRAND_TEXT_COLUMNS = (
        'industry',
        'target_audience',
        'brand_messaging',
        'tone_voice',
        'category_alignment',
        'brand_alignment_keywords',
        'content_type',
    )

    # Columns read from an influencer row, in the order they appear in the text.
    INFLUENCER_TEXT_COLUMNS = (
        'category_niche',
        'audience_demographics',
        'audience_interests',
        'content_types',
    )

    # Columns shared by the printed ranking and the saved CSV.
    _SCORE_COLUMNS = (
        'brand_id',
        'brand_name',
        'influencer_id',
        'influencer_name',
        'similarity_score',
    )

    # (minimum score, lines to print), ordered from strongest to weakest tier.
    _MATCH_TIERS = (
        (0.8, (
            " Excellent Match (≥0.8):",
            " - Very strong text similarity",
            " - High potential for successful collaboration",
            " - Strong alignment in multiple areas",
        )),
        (0.6, (
            " Good Match (≥0.6):",
            " - Significant text similarity",
            " - Good potential for collaboration",
            " - Notable alignment in key areas",
        )),
        (0.4, (
            " Moderate Match (≥0.4):",
            " - Some text similarity",
            " - Potential for collaboration with careful consideration",
            " - Partial alignment in some areas",
        )),
    )

    # Printed when the score reaches none of the tiers above (including NaN).
    _WEAK_MATCH_LINES = (
        " Weak Match (<0.4):",
        " - Limited text similarity",
        " - May need to reconsider match",
        " - Limited alignment in key areas",
    )

    def __init__(self, api_key: Optional[str] = None,
                 model_name: str = DEFAULT_MODEL_NAME):
        """
        Initialize TextEmbedder with a sentence-transformer model.

        Args:
            api_key: Kept for backward compatibility; it is not used.
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading
                is logged and re-raised unchanged.
        """
        try:
            self.model = SentenceTransformer(model_name)
        except Exception as e:
            logger.error("Error loading sentence-transformers model: %s", e)
            raise
        logger.info("Successfully loaded sentence-transformers model: %s", model_name)

    def _combine_text_features(self, row: pd.Series, text_columns: List[str]) -> str:
        """
        Combine multiple text columns from a series into a single text feature.

        Columns that are absent from ``row`` or hold a missing value are
        skipped. Each remaining one is rendered as ``"<column>: <value>"`` and
        the parts are joined with ``" | "``; the result is ``""`` when nothing
        is present.
        """
        return " | ".join(
            f"{col}: {str(row[col])}"
            for col in text_columns
            if col in row and pd.notna(row[col])
        )

    def get_brand_text_features(self, brand: pd.Series) -> str:
        """
        Extract relevant text features from brand data.

        Returns the combined text of ``BRAND_TEXT_COLUMNS`` for ``brand``.
        """
        return self._combine_text_features(brand, list(self.BRAND_TEXT_COLUMNS))

    def get_influencer_text_features(self, influencer: pd.Series) -> str:
        """
        Extract relevant text features from influencer data.

        Returns the combined text of ``INFLUENCER_TEXT_COLUMNS`` for
        ``influencer``.
        """
        return self._combine_text_features(influencer, list(self.INFLUENCER_TEXT_COLUMNS))

    def _zero_embedding(self) -> np.ndarray:
        """Return an all-zero float32 vector of the model's embedding size."""
        # float32 rather than NumPy's float64 default: presumably this matches
        # what ``encode`` returns, so a fallback vector can be compared with a
        # real one without a dtype mismatch -- TODO confirm for custom models.
        return np.zeros(self.model.get_sentence_embedding_dimension(), dtype=np.float32)

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Generate an embedding for a text using the loaded model.

        This is best-effort: empty or whitespace-only text, and any error
        raised while encoding, yield a zero vector instead of an exception
        (errors are logged).
        """
        try:
            if not text or text.isspace():
                return self._zero_embedding()
            return self.model.encode(text)
        except Exception as e:
            logger.error("Error getting embedding: %s", e)
            return self._zero_embedding()

    def _embed_or_none(self, text: str) -> Optional[np.ndarray]:
        """Embed ``text``, or return None for empty text so no model call is made."""
        return self.get_embedding(text) if text else None

    def _embedding_similarity(self, first: np.ndarray, second: np.ndarray) -> float:
        """Return the cosine similarity of two embedding vectors as a float."""
        # Cast both sides to one dtype so vectors from different sources
        # (model output vs. zero fallback) can always be multiplied together.
        score = cos_sim(
            np.asarray(first, dtype=np.float32).reshape(1, -1),
            np.asarray(second, dtype=np.float32).reshape(1, -1),
        )
        # cos_sim returns a 1x1 tensor here; pull out the Python float.
        return float(score[0][0].item())

    def calculate_text_similarity(self, brand_text: str, influencer_text: str) -> float:
        """
        Calculate cosine similarity between brand and influencer text using cos_sim.

        Returns 0.0 (and logs a warning) when either text is empty.
        """
        if not brand_text or not influencer_text:
            logger.warning("Empty text provided for similarity calculation")
            return 0.0
        return self._embedding_similarity(
            self.get_embedding(brand_text),
            self.get_embedding(influencer_text),
        )

    def print_detailed_match_analysis(self, brand: pd.Series, influencer: pd.Series,
                                      similarity_score: float):
        """
        Print detailed analysis of the match between a brand and influencer.

        The report lists both ids and names, the text features used for each
        side, the score, and the interpretation tier the score falls into.
        """
        print("\n" + "="*80)
        print("Brand Details:")
        print(f" ID: {brand.name}")
        print(f" Name: {brand.get('name', 'Unknown Brand')}")
        print("\nInfluencer Details:")
        print(f" ID: {influencer.name}")
        print(f" Name: {influencer.get('name', 'Unknown Influencer')}")
        print("-"*80)

        print("\nBrand Text Features:")
        for feature in self.get_brand_text_features(brand).split(" | "):
            print(f" - {feature}")
        print("\nInfluencer Text Features:")
        for feature in self.get_influencer_text_features(influencer).split(" | "):
            print(f" - {feature}")

        print("\nText Similarity Analysis:")
        print(f" Score: {similarity_score:.4f}")
        print("\nScore Interpretation:")
        tier_lines = self._WEAK_MATCH_LINES
        for threshold, lines in self._MATCH_TIERS:
            if similarity_score >= threshold:
                tier_lines = lines
                break
        for line in tier_lines:
            print(line)
        print("="*80)

    def _iter_pair_scores(
        self, brands_df: pd.DataFrame, influencers_df: pd.DataFrame
    ) -> Iterator[Tuple[int, int, pd.Series, pd.Series, str, str, float]]:
        """
        Yield one scored record per brand/influencer pair, brands outermost.

        Each item is ``(brand_pos, influencer_pos, brand, influencer,
        brand_text, influencer_text, similarity)`` where the positions are
        0-based row positions, not index labels, so duplicate index labels
        still address distinct rows.
        """
        if len(brands_df) == 0 or len(influencers_df) == 0:
            return

        # Embed every influencer once up front instead of once per pair.
        influencer_rows = []
        for _, influencer in influencers_df.iterrows():
            influencer_text = self.get_influencer_text_features(influencer)
            influencer_rows.append(
                (influencer, influencer_text, self._embed_or_none(influencer_text))
            )

        for brand_pos, (_, brand) in enumerate(brands_df.iterrows()):
            brand_text = self.get_brand_text_features(brand)
            brand_vector = self._embed_or_none(brand_text)
            for influencer_pos, row in enumerate(influencer_rows):
                influencer, influencer_text, influencer_vector = row
                if brand_vector is None or influencer_vector is None:
                    # Same outcome as calculate_text_similarity for empty text.
                    logger.warning("Empty text provided for similarity calculation")
                    similarity = 0.0
                else:
                    similarity = self._embedding_similarity(brand_vector, influencer_vector)
                yield (brand_pos, influencer_pos, brand, influencer,
                       brand_text, influencer_text, similarity)

    @staticmethod
    def _score_record(brand: pd.Series, influencer: pd.Series, similarity: float) -> dict:
        """Build the id/name/score dictionary describing one scored pair."""
        return {
            'brand_id': brand.name,
            'brand_name': brand.get('name', 'Unknown Brand'),
            'influencer_id': influencer.name,
            'influencer_name': influencer.get('name', 'Unknown Influencer'),
            'similarity_score': similarity,
        }

    def get_text_similarity_matrix(self, brands_df: pd.DataFrame,
                                   influencers_df: pd.DataFrame,
                                   *, verbose: bool = True) -> np.ndarray:
        """
        Calculate text similarity matrix between all brands and influencers.

        Args:
            brands_df: One brand per row.
            influencers_df: One influencer per row.
            verbose: When True (the default), print the detailed analysis for
                every pair as it is scored.

        Returns:
            Array of shape ``(len(brands_df), len(influencers_df))`` whose
            ``[i, j]`` entry is the similarity between the i-th brand row and
            the j-th influencer row. Also prints the ten best-scoring pairs.
        """
        similarity_matrix = np.zeros((len(brands_df), len(influencers_df)))
        print("\nCalculating Text Similarity Scores:")
        print("="*80)

        all_scores = []
        total_comparisons = len(brands_df) * len(influencers_df)
        pairs = self._iter_pair_scores(brands_df, influencers_df)
        for completed, pair in enumerate(pairs, start=1):
            brand_pos, influencer_pos, brand, influencer, _, _, similarity = pair
            similarity_matrix[brand_pos, influencer_pos] = similarity
            all_scores.append(self._score_record(brand, influencer, similarity))
            if verbose:
                self.print_detailed_match_analysis(brand, influencer, similarity)
            if completed % 10 == 0 or completed == total_comparisons:
                logger.info(
                    "Progress: %d/%d comparisons (%.1f%%)",
                    completed, total_comparisons,
                    (completed / total_comparisons) * 100,
                )

        # Explicit columns keep the sort and selection valid with zero pairs.
        score_columns = list(self._SCORE_COLUMNS)
        scores_df = pd.DataFrame(all_scores, columns=score_columns)
        scores_df = scores_df.sort_values('similarity_score', ascending=False)
        print("\nTop 10 Text Similarity Matches:")
        print("="*80)
        print(scores_df[score_columns].head(10).to_string(index=False))
        print("="*80)
        return similarity_matrix

    def save_similarity_scores(self, brands_df: pd.DataFrame,
                               influencers_df: pd.DataFrame,
                               output_path: str):
        """
        Calculate and save all similarity scores to a CSV file.

        One row is written per brand/influencer pair, sorted by descending
        score, with the ids, names, score and the two texts that were
        compared. With no pairs, a header-only file is written.
        """
        all_scores = []
        total_comparisons = len(brands_df) * len(influencers_df)
        logger.info(
            "Starting to calculate similarity scores for %d brand-influencer pairs",
            total_comparisons,
        )

        pairs = self._iter_pair_scores(brands_df, influencers_df)
        for completed, pair in enumerate(pairs, start=1):
            _, _, brand, influencer, brand_text, influencer_text, similarity = pair
            record = self._score_record(brand, influencer, similarity)
            record['brand_text'] = brand_text
            record['influencer_text'] = influencer_text
            all_scores.append(record)
            if completed % 20 == 0 or completed == total_comparisons:
                logger.info(
                    "Progress: %d/%d (%.1f%%)",
                    completed, total_comparisons,
                    (completed / total_comparisons) * 100,
                )

        # Explicit columns keep the sort valid (and the header present) with zero pairs.
        csv_columns = list(self._SCORE_COLUMNS) + ['brand_text', 'influencer_text']
        scores_df = pd.DataFrame(all_scores, columns=csv_columns)
        scores_df = scores_df.sort_values('similarity_score', ascending=False)
        scores_df.to_csv(output_path, index=False)
        logger.info("Saved detailed similarity scores to %s", output_path)



# Comments