gte large model

PHOTO

Wed Feb 26 2025 10:50:56 GMT+0000 (Coordinated Universal Time)

Saved by @piyushkumar121 #python

import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import logging


# Configure logging once at import time: INFO level and timestamped
# "<time> - <level> - <message>" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

class TextEmbedder:
    """
    Score brand/influencer pairs by cosine similarity of sentence embeddings.

    Each row is flattened into a "column: value | column: value" description,
    embedded with a sentence-transformers model, and compared with cos_sim.
    """

    # Brand columns folded into the brand description, in output order.
    BRAND_TEXT_COLUMNS = (
        'industry',
        'target_audience',
        'brand_messaging',
        'tone_voice',
        'category_alignment',
        'brand_alignment_keywords',
        'content_type',
    )

    # Influencer columns folded into the influencer description, in output order.
    INFLUENCER_TEXT_COLUMNS = (
        'category_niche',
        'audience_demographics',
        'audience_interests',
        'content_types',
    )

    # (inclusive lower bound, heading, detail lines), checked from the top down.
    _SCORE_BANDS = (
        (0.8, "  Excellent Match (≥0.8):", (
            "  - Very strong text similarity",
            "  - High potential for successful collaboration",
            "  - Strong alignment in multiple areas",
        )),
        (0.6, "  Good Match (≥0.6):", (
            "  - Significant text similarity",
            "  - Good potential for collaboration",
            "  - Notable alignment in key areas",
        )),
        (0.4, "  Moderate Match (≥0.4):", (
            "  - Some text similarity",
            "  - Potential for collaboration with careful consideration",
            "  - Partial alignment in some areas",
        )),
    )

    # Used when the score is below every band above (or is NaN).
    _WEAK_BAND = ("  Weak Match (<0.4):", (
        "  - Limited text similarity",
        "  - May need to reconsider match",
        "  - Limited alignment in key areas",
    ))

    def __init__(self, api_key: str = None, model_name: str = 'thenlper/gte-large'):
        """
        Initialize TextEmbedder with a sentence-transformer model.

        Args:
            api_key: Kept for backward compatibility; it is not used.
            model_name: Name passed to SentenceTransformer. Defaults to the
                thenlper/gte-large model the original code hard-coded.

        Raises:
            Exception: Whatever SentenceTransformer raises while loading; the
                error is logged and re-raised unchanged.
        """
        self.model_name = model_name
        try:
            self.model = SentenceTransformer(model_name)
            logger.info(f"Successfully loaded sentence-transformers model: {model_name}")
        except Exception as e:
            logger.error(f"Error loading sentence-transformers model: {str(e)}")
            raise

    def _combine_text_features(self, row: pd.Series, text_columns: List[str]) -> str:
        """
        Combine multiple text columns from a series into a single text feature.

        Columns that are absent from the row or hold a missing value are
        skipped; the rest are rendered as "column: value" and joined with " | ".
        An empty string is returned when nothing qualifies.
        """
        parts = [
            f"{col}: {str(row[col])}"
            for col in text_columns
            if col in row and pd.notna(row[col])
        ]
        return " | ".join(parts)

    def get_brand_text_features(self, brand: pd.Series) -> str:
        """
        Extract relevant text features from brand data.

        Returns the combined description built from BRAND_TEXT_COLUMNS.
        """
        return self._combine_text_features(brand, self.BRAND_TEXT_COLUMNS)

    def get_influencer_text_features(self, influencer: pd.Series) -> str:
        """
        Extract relevant text features from influencer data.

        Returns the combined description built from INFLUENCER_TEXT_COLUMNS.
        """
        return self._combine_text_features(influencer, self.INFLUENCER_TEXT_COLUMNS)

    def _zero_embedding(self) -> np.ndarray:
        """Return an all-zero vector with the model's embedding dimension."""
        # float32 to line up with what model.encode is expected to return by
        # default. NOTE(review): confirm the encode dtype for the installed
        # sentence-transformers version.
        return np.zeros(self.model.get_sentence_embedding_dimension(), dtype=np.float32)

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Generate an embedding for a text with the loaded model.

        Returns:
            The model's embedding, or a zero vector of the model's dimension
            when the text is empty/whitespace-only or encoding raises (the
            error is logged rather than propagated).
        """
        try:
            if not text or text.isspace():
                # Nothing meaningful to encode.
                return self._zero_embedding()
            return self.model.encode(text)
        except Exception as e:
            # Deliberate best-effort: one bad text must not abort a whole run.
            logger.error(f"Error getting embedding: {str(e)}")
            return self._zero_embedding()

    def _embedding_similarity(self, first: np.ndarray, second: np.ndarray) -> float:
        """Return the cosine similarity of two 1-D embedding vectors as a float."""
        # Cast both sides to one dtype: a zero-vector fallback paired with a real
        # embedding could otherwise reach cos_sim with mixed precisions.
        # NOTE(review): presumably the underlying torch matmul rejects mixed
        # dtypes -- confirm against the installed version.
        left = np.asarray(first, dtype=np.float32).reshape(1, -1)
        right = np.asarray(second, dtype=np.float32).reshape(1, -1)
        # cos_sim returns a 1x1 result here; pull out the single value.
        return float(cos_sim(left, right)[0][0].item())

    def calculate_text_similarity(self, brand_text: str, influencer_text: str) -> float:
        """
        Calculate cosine similarity between brand and influencer text using cos_sim.

        Returns 0.0 (and logs a warning) when either text is empty.
        """
        if not brand_text or not influencer_text:
            logger.warning("Empty text provided for similarity calculation")
            return 0.0

        return self._embedding_similarity(
            self.get_embedding(brand_text),
            self.get_embedding(influencer_text),
        )

    def _iter_pair_scores(self, brands_df: pd.DataFrame, influencers_df: pd.DataFrame):
        """
        Yield one scored record per brand/influencer pair, brands outermost.

        Each record is a tuple
        (row_pos, col_pos, brand, influencer, brand_text, influencer_text, similarity)
        where row_pos/col_pos are 0-based positions in the two frames.

        Every text is embedded once and reused across pairs, so the model is
        called once per row instead of twice per pair. Pairs in which either
        text is empty score 0.0 and log the same warning that
        calculate_text_similarity emits.
        """
        if len(brands_df) == 0 or len(influencers_df) == 0:
            # No pairs to score; do not touch the model at all.
            return

        influencers = []
        for _, influencer in influencers_df.iterrows():
            influencer_text = self.get_influencer_text_features(influencer)
            influencer_embedding = self.get_embedding(influencer_text) if influencer_text else None
            influencers.append((influencer, influencer_text, influencer_embedding))

        # enumerate() gives positions directly; index.get_loc would return a
        # slice or mask instead of an int when the index has duplicate labels.
        for row_pos, (_, brand) in enumerate(brands_df.iterrows()):
            brand_text = self.get_brand_text_features(brand)
            brand_embedding = self.get_embedding(brand_text) if brand_text else None

            for col_pos, (influencer, influencer_text, influencer_embedding) in enumerate(influencers):
                if brand_embedding is None or influencer_embedding is None:
                    logger.warning("Empty text provided for similarity calculation")
                    similarity = 0.0
                else:
                    similarity = self._embedding_similarity(brand_embedding, influencer_embedding)
                yield (row_pos, col_pos, brand, influencer,
                       brand_text, influencer_text, similarity)

    def print_detailed_match_analysis(self, brand: pd.Series, influencer: pd.Series, similarity_score: float):
        """
        Print detailed analysis of the match between a brand and influencer.

        Prints the ids (the Series names) and 'name' fields of both rows, their
        combined text features, the score to four decimals, and the
        interpretation band the score falls into.
        """
        print("\n" + "="*80)

        print("Brand Details:")
        print(f"  ID: {brand.name}")
        print(f"  Name: {brand.get('name', 'Unknown Brand')}")

        print("\nInfluencer Details:")
        print(f"  ID: {influencer.name}")
        print(f"  Name: {influencer.get('name', 'Unknown Influencer')}")
        print("-"*80)

        print("\nBrand Text Features:")
        for feature in self.get_brand_text_features(brand).split(" | "):
            print(f"  - {feature}")

        print("\nInfluencer Text Features:")
        for feature in self.get_influencer_text_features(influencer).split(" | "):
            print(f"  - {feature}")

        print("\nText Similarity Analysis:")
        print(f"  Score: {similarity_score:.4f}")

        print("\nScore Interpretation:")
        heading, details = self._WEAK_BAND
        for threshold, band_heading, band_details in self._SCORE_BANDS:
            if similarity_score >= threshold:
                heading, details = band_heading, band_details
                break
        print(heading)
        for line in details:
            print(line)

        print("="*80)

    def get_text_similarity_matrix(self, brands_df: pd.DataFrame,
                                 influencers_df: pd.DataFrame) -> np.ndarray:
        """
        Calculate text similarity matrix between all brands and influencers.

        Prints a detailed analysis for every pair and a top-10 table at the end.

        Returns:
            Array of shape (len(brands_df), len(influencers_df)); entry [i, j]
            is the similarity of the i-th brand row and j-th influencer row.
            Empty inputs yield an empty matrix instead of an error.
        """
        similarity_matrix = np.zeros((len(brands_df), len(influencers_df)))

        print("\nCalculating Text Similarity Scores:")
        print("="*80)

        all_scores = []
        total_comparisons = len(brands_df) * len(influencers_df)

        pairs = self._iter_pair_scores(brands_df, influencers_df)
        for completed, pair in enumerate(pairs, start=1):
            row_pos, col_pos, brand, influencer, _, _, similarity = pair
            similarity_matrix[row_pos, col_pos] = similarity

            all_scores.append({
                'brand_id': brand.name,
                'brand_name': brand.get('name', 'Unknown Brand'),
                'influencer_id': influencer.name,
                'influencer_name': influencer.get('name', 'Unknown Influencer'),
                'similarity_score': similarity
            })

            self.print_detailed_match_analysis(brand, influencer, similarity)

            if completed % 10 == 0 or completed == total_comparisons:
                logger.info(f"Progress: {completed}/{total_comparisons} comparisons ({(completed/total_comparisons)*100:.1f}%)")

        # Explicit columns keep the sort and the printout valid with zero rows.
        columns = ['brand_id', 'brand_name', 'influencer_id', 'influencer_name', 'similarity_score']
        scores_df = pd.DataFrame(all_scores, columns=columns)
        scores_df = scores_df.sort_values('similarity_score', ascending=False)

        print("\nTop 10 Text Similarity Matches:")
        print("="*80)
        print(scores_df[columns].head(10).to_string(index=False))
        print("="*80)

        return similarity_matrix

    def save_similarity_scores(self, brands_df: pd.DataFrame,
                             influencers_df: pd.DataFrame,
                             output_path: str):
        """
        Calculate and save all similarity scores to a CSV file.

        The CSV has one row per brand/influencer pair, sorted by descending
        score, and includes the combined texts that were compared. With empty
        inputs a header-only file is written.
        """
        all_scores = []
        total_comparisons = len(brands_df) * len(influencers_df)

        logger.info(f"Starting to calculate similarity scores for {total_comparisons} brand-influencer pairs")

        pairs = self._iter_pair_scores(brands_df, influencers_df)
        for completed, pair in enumerate(pairs, start=1):
            _, _, brand, influencer, brand_text, influencer_text, similarity = pair

            all_scores.append({
                'brand_id': brand.name,
                'brand_name': brand.get('name', 'Unknown Brand'),
                'influencer_id': influencer.name,
                'influencer_name': influencer.get('name', 'Unknown Influencer'),
                'similarity_score': similarity,
                'brand_text': brand_text,
                'influencer_text': influencer_text
            })

            if completed % 20 == 0 or completed == total_comparisons:
                logger.info(f"Progress: {completed}/{total_comparisons} ({(completed/total_comparisons)*100:.1f}%)")

        # Explicit columns keep the sort valid (and the header present) with zero rows.
        columns = ['brand_id', 'brand_name', 'influencer_id', 'influencer_name',
                   'similarity_score', 'brand_text', 'influencer_text']
        scores_df = pd.DataFrame(all_scores, columns=columns)
        scores_df = scores_df.sort_values('similarity_score', ascending=False)
        scores_df.to_csv(output_path, index=False)
        logger.info(f"Saved detailed similarity scores to {output_path}")

Save snippets that work from anywhere online with our extensions

Available in the Chrome Web Store

Get Firefox Add-on

Get VS Code extension

Comments

More like this

Importing images from a directory (Python) to list or dictionary

from PIL import Image
import glob
# Open every GIF matched under 'yourpath/' and collect the opened images, in glob order.
image_list = [
    Image.open(gif_path)
    for gif_path in glob.glob('yourpath/*.gif')  # assuming gif
]

python - Find out the percentage of missing values in each column in the given dataset - Stack Overflow

# Per-column count of null cells, then the same figure as a percentage of the row count.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)

# Two-column summary: the column label next to its missing-value percentage.
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)

#python #python #loops #whileloop

Print the name of 7 days in a week - by using while loop

# Index of the next day to print.
days = 0
week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Walk the list by index until every day has been printed.
while days < len(week):
    print("Today is " + week[days])
    days += 1

Getting the index of an item in a list containing it in Python

>>> ["foo", "bar", "baz"].index("bar")
1

#javascript #python #search #historicalcode #google #algorithms

Google’s PageRank Algorithm from 1996 - the origin of internet search

import numpy as np

def pagerank(M, num_iterations=100, d=0.85):
    """Iterate the damped PageRank update and return the final rank vector.

    Args:
        M: 2-D array; its column count is taken as the number of nodes.
            Presumably a column-stochastic link matrix -- verify with the caller.
        num_iterations: how many update steps to run.
        d: damping factor weighting the link term against the uniform term.

    Returns:
        Array of shape (N, 1). The starting vector is random, so the result
        depends on the starting point unless the iteration has converged.
    """
    node_count = M.shape[1]

    # Random starting vector, scaled so its entries sum to 1.
    ranks = np.random.rand(node_count, 1)
    ranks = ranks / np.linalg.norm(ranks, 1)

    remaining = num_iterations
    while remaining > 0:
        remaining -= 1
        ranks = d * np.matmul(M, ranks) + (1 - d) / node_count
    return ranks

#python #python #strings #vowels #function

Get vowels in strings

This method gets vowels (‘a’, ‘e’, ‘i’, ‘o’, ‘u’) found in a string.
   
#make a function:
def get_vowels(string):
    """Return the items of ``string`` that occur in 'aeiou', in their original order."""
    vowels = 'aeiou'
    found = []
    for ch in string:
        # Only lowercase vowels are listed, so uppercase letters never match.
        if ch in vowels:
            found.append(ch)
    return found


#assign the words and function will return vowels words.
get_vowels('foobar') # ['o', 'o', 'a']


get_vowels('gym') # []

Could not build wheels for tokenizers which use PEP 517 and cannot be installed directly

https://github.com/pydata/bottleneck/issues/281

How To Bypass Cloudflare Bot Protection In Selenium - CodingTutz

# Build a Chrome options object, adjust the automation-related settings, then start the driver.
# NOTE(review): `webdriver` is not imported in this snippet; presumably
# `from selenium import webdriver` is expected above it.
options = webdriver.ChromeOptions()
# Exclude the "enable-automation" switch from the switches Chrome is started with.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Turn off the automation extension.
options.add_experimental_option('useAutomationExtension', False)
# Disable the Blink "AutomationControlled" feature.
options.add_argument("--disable-blink-features=AutomationControlled")
# Launch Chrome with the options configured above.
driver = webdriver.Chrome(options=options)

Python Loop through Excel sheets, place into one df - Stack Overflow

import pandas as pd

# sheet_name=None loads every sheet and returns a dict of {sheet name: DataFrame}.
sheets_dict = pd.read_excel('Book1.xlsx', sheet_name=None)

# Collect the per-sheet frames and concatenate once; DataFrame.append no longer
# exists in current pandas and copied the growing table on every iteration.
frames = []
for name, sheet in sheets_dict.items():
    # Tag each row with the sheet it came from.
    sheet['sheet'] = name
    # Keep only the last line of multi-line column headers.
    sheet = sheet.rename(columns=lambda x: x.split('\n')[-1])
    frames.append(sheet)

# ignore_index=True renumbers the rows 0..n-1, replacing the separate reset_index call.
full_table = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(full_table)

#python #dates #functions #python3.8

How to parse a String into Datetime in Python

from datetime import datetime

# Parse the timestamp with a matching format: abbreviated month name, day,
# four-digit year, then a 12-hour clock time followed by AM/PM.
datetime_object = datetime.strptime(
    'Jun 1 2005  1:33PM',
    '%b %d %Y %I:%M%p',
)

python - Way to change Google Chrome user agent in Selenium? - Stack Overflow

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

# Start Chrome with a randomly chosen user-agent string, open one page, then quit.
options = Options()
ua = UserAgent()
# Pick a random user-agent string from fake_useragent and show which one was chosen.
userAgent = ua.random
print(userAgent)
# Pass the chosen string to Chrome as its user-agent argument.
options.add_argument(f'user-agent={userAgent}')
# NOTE(review): the `chrome_options=` and `executable_path=` keywords look like the
# older Selenium API; newer releases presumably expect `options=` and a Service
# object instead -- confirm against the installed Selenium version.
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver.get("https://www.google.co.in")
# Close the browser and end the WebDriver session.
driver.quit()

python - How to see the progress bar of read_csv - Stack Overflow

def read_csv_pgbar(csv_path, chunksize, usecols, dtype=object):
    """Read a CSV file in chunks while showing a tqdm progress bar.

    Args:
        csv_path: path of the CSV file to read.
        chunksize: number of rows per chunk passed to ``pd.read_csv``.
        usecols: columns to load, passed to ``pd.read_csv``.
        dtype: dtype passed to ``pd.read_csv`` (defaults to ``object``).

    Returns:
        One DataFrame holding all chunks concatenated in file order.
    """
    # Count lines up front so the bar has a total. The context manager closes
    # the handle; the original left it open until garbage collection.
    with open(csv_path, 'r') as handle:
        rows = sum(1 for _ in handle) - 1  # minus the header
    # An empty file would otherwise give the bar a negative total.
    rows = max(rows, 0)

    chunk_list = []

    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize, usecols=usecols, dtype=dtype):
            chunk_list.append(chunk)
            bar.update(len(chunk))

    df = pd.concat(chunk_list, axis=0)
    print('Finish reading csv file')

    return df

#python #python #lists #dictionary

Convert two lists into a dictionary

# make a function: def is the keyword for the function:
def to_dictionary(keys, values):
    """Pair each key with the value at the same position and return the dict.

    Pairing stops at the shorter of the two inputs.
    """
    # return is the keyword that tells the program the function has to return a value
    return dict(zip(keys, values))


# keys and values are the lists:
keys = ["a", "b", "c"]
values = [2, 3, 4]

to_dictionary(keys, values)  # {'a': 2, 'b': 3, 'c': 4}

#python #interesting #arrays #sorting #interviewquestions

Sorting an array without changing position of negative numbers

# Python3 implementation of the approach 

# Function to sort the array such that 
# negative values do not get affected 
def sortArray(a, n):
    """Sort the non-negative values among the first ``n`` items of ``a`` in place.

    Negative values keep their positions; the non-negative values are
    redistributed in ascending order over the slots they occupied. The
    first ``n`` items are then printed, each followed by a space.
    """
    # Ascending supply of the non-negative values, consumed slot by slot.
    ordered = iter(sorted(a[k] for k in range(n) if a[k] >= 0))

    for k in range(n):
        # Negative entries are skipped, so they stay where they are.
        if a[k] >= 0:
            a[k] = next(ordered)

    # Print the sorted array
    for k in range(n):
        print(a[k], end=" ")


# Driver code 

arr = [2, -6, -3, 8, 4, 1] 

n = len(arr) 

sortArray(arr, n)

#python ##python #strings #comments

Create simple string along with variables

#assign a value to a variable:
# assign a value to a variable:
types_of_people = 10
# make a string using the variable name (an f-string substitutes it inside the braces):
X = f"there are {types_of_people} types of people."

Output:
there are 10 types of people.

Browse more snippets >>