import logging
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


class TextEmbedder:
    """Build text from brand/influencer rows, embed it, and score pairs.

    Each row's selected columns are joined into one string, the string is
    encoded with a sentence-transformers model, and brand/influencer pairs
    are scored with cosine similarity.
    """

    # Columns read from a brand row, in the order they appear in the text.
    BRAND_TEXT_COLUMNS: List[str] = [
        'industry',
        'target_audience',
        'brand_messaging',
        'tone_voice',
        'category_alignment',
        'brand_alignment_keywords',
        'content_type',
    ]

    # Columns read from an influencer row, in the order they appear in the text.
    INFLUENCER_TEXT_COLUMNS: List[str] = [
        'category_niche',
        'audience_demographics',
        'audience_interests',
        'content_types',
    ]

    # Column order of the per-pair score tables. Passing these explicitly to
    # pd.DataFrame keeps the columns present even when there are no pairs,
    # so sorting by 'similarity_score' cannot raise KeyError on empty input.
    _SUMMARY_COLUMNS: List[str] = [
        'brand_id',
        'brand_name',
        'influencer_id',
        'influencer_name',
        'similarity_score',
    ]
    _DETAIL_COLUMNS: List[str] = _SUMMARY_COLUMNS + ['brand_text', 'influencer_text']

    # (inclusive lower bound, lines to print), checked from highest to lowest.
    _SCORE_BANDS = (
        (0.8, (
            " Excellent Match (≥0.8):",
            " - Very strong text similarity",
            " - High potential for successful collaboration",
            " - Strong alignment in multiple areas",
        )),
        (0.6, (
            " Good Match (≥0.6):",
            " - Significant text similarity",
            " - Good potential for collaboration",
            " - Notable alignment in key areas",
        )),
        (0.4, (
            " Moderate Match (≥0.4):",
            " - Some text similarity",
            " - Potential for collaboration with careful consideration",
            " - Partial alignment in some areas",
        )),
    )
    # Printed when the score reaches none of the bands above.
    _WEAK_MATCH_LINES = (
        " Weak Match (<0.4):",
        " - Limited text similarity",
        " - May need to reconsider match",
        " - Limited alignment in key areas",
    )

    def __init__(self, api_key: Optional[str] = None,
                 model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize TextEmbedder with a sentence-transformer model.

        Args:
            api_key: Kept for backward compatibility; it is not used.
            model_name: Value passed to ``SentenceTransformer``. Defaults to
                'all-MiniLM-L6-v2'.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading
                is logged and re-raised unchanged.
        """
        try:
            self.model = SentenceTransformer(model_name)
            logger.info("Successfully loaded sentence-transformers model: %s", model_name)
        except Exception as e:
            logger.error("Error loading sentence-transformers model: %s", e)
            raise

    def _combine_text_features(self, row: pd.Series, text_columns: List[str]) -> str:
        """
        Combine multiple text columns from a series into a single text feature.

        Columns that are absent from ``row`` or hold a missing value are
        skipped. Each remaining column contributes ``"<column>: <value>"`` and
        the parts are joined with ``" | "``. Returns ``""`` when nothing
        qualifies.
        """
        parts = [
            f"{col}: {str(row[col])}"
            for col in text_columns
            if col in row and pd.notna(row[col])
        ]
        return " | ".join(parts)

    def get_brand_text_features(self, brand: pd.Series) -> str:
        """
        Extract relevant text features from brand data.

        Returns the combined text of ``BRAND_TEXT_COLUMNS`` for this row.
        """
        return self._combine_text_features(brand, self.BRAND_TEXT_COLUMNS)

    def get_influencer_text_features(self, influencer: pd.Series) -> str:
        """
        Extract relevant text features from influencer data.

        Returns the combined text of ``INFLUENCER_TEXT_COLUMNS`` for this row.
        """
        return self._combine_text_features(influencer, self.INFLUENCER_TEXT_COLUMNS)

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Generate embeddings for a text using sentence-transformers.

        Returns the result of ``self.model.encode(text)``. For empty or
        whitespace-only text, or when encoding raises, a zero vector of
        length ``self.model.get_sentence_embedding_dimension()`` is returned
        instead (encoding errors are logged, not propagated).
        """
        try:
            if not text or text.isspace():
                # Nothing to encode: use a zero vector of the model's width.
                return np.zeros(self.model.get_sentence_embedding_dimension())
            return self.model.encode(text)
        except Exception as e:
            logger.error("Error getting embedding: %s", e)
            # Best-effort fallback so one bad text does not abort a whole run.
            return np.zeros(self.model.get_sentence_embedding_dimension())

    @staticmethod
    def _cosine(first: np.ndarray, second: np.ndarray) -> float:
        """Return the cosine similarity of two 1-D vectors as a Python float."""
        # cosine_similarity works on 2-D inputs, so view each vector as one row.
        score = cosine_similarity(first.reshape(1, -1), second.reshape(1, -1))[0][0]
        return float(score)

    def calculate_text_similarity(self, brand_text: str, influencer_text: str) -> float:
        """
        Calculate cosine similarity between brand and influencer text.

        Returns 0.0 (and logs a warning) when either text is empty; otherwise
        both texts are embedded with ``get_embedding`` and compared.
        """
        if not brand_text or not influencer_text:
            logger.warning("Empty text provided for similarity calculation")
            return 0.0

        return self._cosine(
            self.get_embedding(brand_text),
            self.get_embedding(influencer_text),
        )

    def _embed_rows(
        self,
        df: pd.DataFrame,
        extract_text: Callable[[pd.Series], str],
    ) -> List[Tuple[pd.Series, str, Optional[np.ndarray]]]:
        """Return ``(row, text, embedding)`` per row, embedding each text once.

        The embedding is ``None`` for rows whose text is empty, because such
        rows always score 0.0 and never need a vector.
        """
        prepared = []
        for _, row in df.iterrows():
            text = extract_text(row)
            embedding = self.get_embedding(text) if text else None
            prepared.append((row, text, embedding))
        return prepared

    def _pair_similarity(
        self,
        brand_text: str,
        brand_embedding: Optional[np.ndarray],
        influencer_text: str,
        influencer_embedding: Optional[np.ndarray],
    ) -> float:
        """Score one pair from precomputed embeddings.

        Applies the same empty-text rule as ``calculate_text_similarity``.
        """
        if not brand_text or not influencer_text:
            logger.warning("Empty text provided for similarity calculation")
            return 0.0
        return self._cosine(brand_embedding, influencer_embedding)

    @staticmethod
    def _pair_record(brand: pd.Series, influencer: pd.Series,
                     similarity: float) -> Dict[str, object]:
        """Build the id/name/score record shared by both score tables."""
        return {
            'brand_id': brand.name,
            'brand_name': brand.get('name', 'Unknown Brand'),
            'influencer_id': influencer.name,
            'influencer_name': influencer.get('name', 'Unknown Influencer'),
            'similarity_score': similarity,
        }

    @classmethod
    def _interpretation_lines(cls, similarity_score: float) -> Tuple[str, ...]:
        """Return the lines describing the band ``similarity_score`` falls in."""
        for lower_bound, lines in cls._SCORE_BANDS:
            if similarity_score >= lower_bound:
                return lines
        return cls._WEAK_MATCH_LINES

    def print_detailed_match_analysis(self, brand: pd.Series, influencer: pd.Series,
                                      similarity_score: float):
        """
        Print detailed analysis of the match between a brand and influencer.

        Writes to stdout: ids and names of both rows, each text feature on
        its own line, the score to four decimals, and the interpretation for
        the band the score falls in (>=0.8, >=0.6, >=0.4, otherwise weak).
        """
        rule = "=" * 80

        print("\n" + rule)
        print("Brand Details:")
        print(f" ID: {brand.name}")
        print(f" Name: {brand.get('name', 'Unknown Brand')}")

        print("\nInfluencer Details:")
        print(f" ID: {influencer.name}")
        print(f" Name: {influencer.get('name', 'Unknown Influencer')}")
        print("-" * 80)

        sections = (
            ("\nBrand Text Features:", self.get_brand_text_features(brand)),
            ("\nInfluencer Text Features:", self.get_influencer_text_features(influencer)),
        )
        for heading, combined_text in sections:
            print(heading)
            for feature in combined_text.split(" | "):
                print(f" - {feature}")

        print("\nText Similarity Analysis:")
        print(f" Score: {similarity_score:.4f}")

        print("\nScore Interpretation:")
        for line in self._interpretation_lines(similarity_score):
            print(line)
        print(rule)

    def get_text_similarity_matrix(self, brands_df: pd.DataFrame,
                                   influencers_df: pd.DataFrame) -> np.ndarray:
        """
        Calculate text similarity matrix between all brands and influencers.

        Prints a detailed analysis for every pair and a top-10 table, and
        logs progress every 10 comparisons.

        Returns:
            Array of shape ``(len(brands_df), len(influencers_df))`` where
            entry ``[r, c]`` is the score of the r-th brand row against the
            c-th influencer row, by position.
        """
        similarity_matrix = np.zeros((len(brands_df), len(influencers_df)))

        print("\nCalculating Text Similarity Scores:")
        print("=" * 80)

        # Embed every row once up front instead of once per pair.
        brands = self._embed_rows(brands_df, self.get_brand_text_features)
        influencers = self._embed_rows(influencers_df, self.get_influencer_text_features)

        records = []
        total_comparisons = len(brands) * len(influencers)
        completed = 0

        # enumerate() gives row/column positions directly; looking positions
        # up from index labels breaks when an index has duplicate labels.
        for row_pos, (brand, brand_text, brand_embedding) in enumerate(brands):
            for col_pos, (influencer, influencer_text, influencer_embedding) in enumerate(influencers):
                similarity = self._pair_similarity(
                    brand_text, brand_embedding, influencer_text, influencer_embedding
                )
                similarity_matrix[row_pos, col_pos] = similarity
                records.append(self._pair_record(brand, influencer, similarity))

                self.print_detailed_match_analysis(brand, influencer, similarity)

                completed += 1
                if completed % 10 == 0 or completed == total_comparisons:
                    logger.info(
                        "Progress: %d/%d comparisons (%.1f%%)",
                        completed, total_comparisons,
                        (completed / total_comparisons) * 100,
                    )

        scores_df = pd.DataFrame(records, columns=self._SUMMARY_COLUMNS)
        scores_df = scores_df.sort_values('similarity_score', ascending=False)

        print("\nTop 10 Text Similarity Matches:")
        print("=" * 80)
        print(scores_df[self._SUMMARY_COLUMNS].head(10).to_string(index=False))
        print("=" * 80)

        return similarity_matrix

    def save_similarity_scores(self, brands_df: pd.DataFrame,
                               influencers_df: pd.DataFrame, output_path: str):
        """
        Calculate and save all similarity scores to a CSV file.

        One row is written per brand/influencer pair, sorted by descending
        ``similarity_score``, with the combined brand and influencer text
        alongside the ids, names and score. Progress is logged every 20
        comparisons. With no pairs, a header-only CSV is written.

        Args:
            brands_df: Brand rows.
            influencers_df: Influencer rows.
            output_path: Destination passed to ``DataFrame.to_csv``.
        """
        # Embed every row once up front instead of once per pair.
        brands = self._embed_rows(brands_df, self.get_brand_text_features)
        influencers = self._embed_rows(influencers_df, self.get_influencer_text_features)

        records = []
        total_comparisons = len(brands) * len(influencers)
        completed = 0

        logger.info(
            "Starting to calculate similarity scores for %d brand-influencer pairs",
            total_comparisons,
        )

        for brand, brand_text, brand_embedding in brands:
            for influencer, influencer_text, influencer_embedding in influencers:
                similarity = self._pair_similarity(
                    brand_text, brand_embedding, influencer_text, influencer_embedding
                )
                record = self._pair_record(brand, influencer, similarity)
                record['brand_text'] = brand_text
                record['influencer_text'] = influencer_text
                records.append(record)

                completed += 1
                if completed % 20 == 0 or completed == total_comparisons:
                    logger.info(
                        "Progress: %d/%d (%.1f%%)",
                        completed, total_comparisons,
                        (completed / total_comparisons) * 100,
                    )

        scores_df = pd.DataFrame(records, columns=self._DETAIL_COLUMNS)
        scores_df = scores_df.sort_values('similarity_score', ascending=False)
        scores_df.to_csv(output_path, index=False)
        logger.info("Saved detailed similarity scores to %s", output_path)
Preview:
downloadDownload PNG
downloadDownload JPEG
downloadDownload SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter