# BERT model for 9000
# Sun Aug 11 2024 10:40:48 GMT+0000 (Coordinated Universal Time)
# Saved by @christiana
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# Fine-tune `bert-base-uncased` to predict the `sentiment` column of an
# Amazon product-review CSV from the free-text `review_body` column, then
# report accuracy on a held-out 20% split.

# Step 1: Load the dataset
file_path = './dataset/Amazon-Product-Reviews - Amazon Product Review (1).csv'
df = pd.read_csv(file_path)

# Step 2: Check the first few rows and column names
print("First few rows of the dataset:")
print(df.head())
print("\nColumns in the dataset:")
print(df.columns)

# Step 3: Handling missing values
# Rows with a missing value in *any* column are dropped.
df = df.dropna()

# Step 4: Convert categorical variables to numeric
# NOTE: these encoded columns are kept on `df` (and their encoders in
# `label_encoders`) but are not fed to BERT below; only `review_body` is.
categorical_columns = [
    'marketplace',
    'product_id',
    'product_title',
    'product_category',
    'vine',
    'verified_purchase',
    'review_headline',
]
label_encoders = {}
for column in categorical_columns:
    if column in df.columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Step 5: Feature Engineering
if 'review_date' in df.columns:
    df['Year'] = pd.to_datetime(df['review_date']).dt.year

# Extract features and target
# Cast to str so that a cell pandas parsed as a number does not break the
# tokenizer, which expects text.
X_text = df['review_body'].astype(str)  # Textual data

# Encode the target as contiguous integer ids 0..n_classes-1. The model's
# classification loss needs integer class indices in that range, so raw
# string labels, float labels, or labels such as 1/2 would otherwise fail.
sentiment_encoder = LabelEncoder()
y = pd.Series(
    sentiment_encoder.fit_transform(df['sentiment']),
    index=df.index,
    name='sentiment',
)
# Size the classification head from the data, never below the original 2.
num_labels = max(len(sentiment_encoder.classes_), 2)

# Split the data
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42
)

# Step 6: Prepare data for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(texts):
    """Tokenize a list of strings, padding/truncating each to 512 tokens."""
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)


train_encodings = tokenize_function(X_train_text.tolist())
test_encodings = tokenize_function(X_test_text.tolist())


class SentimentDataset(torch.utils.data.Dataset):
    """Pair tokenizer output with integer labels for use by ``Trainer``.

    Args:
        encodings: Mapping from feature name to a per-example sequence
            (indexed positionally), e.g. the result of ``tokenize_function``.
        labels: Sequence of integer class ids, one per example. A pandas
            Series, list or array is accepted; it is read in positional
            order.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialize to a list so lookups are positional regardless of the
        # (shuffled) pandas index left over from train_test_split.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit long dtype: class indices must be integer tensors.
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)


train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

# Step 7: Train a BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_labels
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# Step 8: Evaluate the model
predictions = trainer.predict(test_dataset)
# Highest-scoring class per example; ids are in the same encoded space as y_test.
preds = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(y_test, preds)
print(f"\nAccuracy of the BERT model: {accuracy:.4f}")
# Comments