BERT model for 9000 Amazon product reviews (sentiment classification)

Sun Aug 11 2024 10:40:48 GMT+0000 (Coordinated Universal Time)

Saved by @christiana

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
file_path = './dataset/Amazon-Product-Reviews - Amazon Product Review (1).csv'
df = pd.read_csv(file_path)

# Step 2: Check the first few rows and column names
print("First few rows of the dataset:")
print(df.head())

print("\nColumns in the dataset:")
print(df.columns)

# Step 3: Handling missing values
df = df.dropna()
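
# Sanity check (diagnostic only, not required by the pipeline): after
# dropna() every column should report zero missing values. Printing the
# per-column counts confirms nothing slipped through before encoding.
print("\nMissing values per column:")
print(df.isna().sum())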

# Step 4: Convert categorical variables to numeric
categorical_columns = ['marketplace', 'product_id', 'product_title', 'product_category', 'vine', 'verified_purchase', 'review_headline']
label_encoders = {}

for column in categorical_columns:
    if column in df.columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
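
# The fitted encoders are kept in label_encoders so encoded ids can be
# mapped back to the original strings later. Illustrative sketch only
# (runs only if the column was present and encoded above):
if 'product_category' in label_encoders:
    sample_ids = df['product_category'].head(3)
    print(label_encoders['product_category'].inverse_transform(sample_ids))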

# Step 5: Feature Engineering
if 'review_date' in df.columns:
    # Parse dates leniently: malformed entries become NaT instead of raising
    df['Year'] = pd.to_datetime(df['review_date'], errors='coerce').dt.year

# Extract features and target (only the raw review text feeds BERT; the
# label-encoded columns above are not used by the model itself)
X_text = df['review_body']
y = df['sentiment']
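
# The Trainer expects integer class ids (0..num_labels-1). If 'sentiment'
# holds text labels such as 'positive'/'negative' (an assumption about this
# CSV, not verified here), encode them; integer labels pass through unchanged.
if y.dtype == object:
    sentiment_encoder = LabelEncoder()
    y = pd.Series(sentiment_encoder.fit_transform(y), index=y.index)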

# Split the data (stratify keeps the class balance similar in both splits)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Prepare data for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

train_encodings = tokenize_function(X_train_text.tolist())
test_encodings = tokenize_function(X_test_text.tolist())
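
# Quick shape check: each encoding field should hold one list per review,
# padded or truncated to exactly max_length=512 tokens.
print(f"Train encodings: {len(train_encodings['input_ids'])} sequences "
      f"of length {len(train_encodings['input_ids'][0])}")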

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)
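
# Sanity-check a single item: it should be a dict of equal-length tensors
# plus a scalar 'labels' tensor, which is the layout the Trainer expects.
sample = train_dataset[0]
print({k: v.shape for k, v in sample.items()})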

# Step 7: Train a BERT model (a randomly initialized binary classification
# head is added on top of the pretrained encoder and learned here)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',         # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,               # linear learning-rate warmup before decay
    weight_decay=0.01,              # L2-style regularization in AdamW
    logging_dir='./logs',
    logging_steps=10,
)
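
# Optional: a compute_metrics callback reports accuracy during evaluation.
# A minimal sketch (not part of the original pipeline): pass it to the
# Trainer below as compute_metrics=compute_metrics, with evaluation enabled
# in TrainingArguments (evaluation_strategy='epoch'; renamed eval_strategy
# in newer transformers releases).
def compute_metrics(eval_pred):
    preds = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': accuracy_score(eval_pred.label_ids, preds)}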

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
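
# Persist the fine-tuned weights and tokenizer so inference can reload them
# without retraining ('./bert-sentiment' is an illustrative path):
trainer.save_model('./bert-sentiment')
tokenizer.save_pretrained('./bert-sentiment')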

# Step 8: Evaluate the model
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=1)  # logits -> predicted class ids
accuracy = accuracy_score(y_test, preds)

print(f"\nAccuracy of the BERT model: {accuracy:.4f}")