import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt

# Establish connection to MySQL
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="Eimaipolykala1",
    database="twitter_db"
)

# Create cursor
cursor = mydb.cursor()

# Execute SQL query to retrieve data
cursor.execute("SELECT JSON_VALUE(data, '$.extended_tweet.display_text_range[1]') AS text_length FROM data_db")

# Fetch data
data = cursor.fetchall()

# Close the connection
mydb.close()

# Create DataFrame
df = pd.DataFrame(data, columns=['Text_Length'])

# Convert Text_Length to numeric
df['Text_Length'] = pd.to_numeric(df['Text_Length'], errors='coerce')

# Drop NaN values
df = df.dropna()

# Plotting
plt.figure(figsize=(10, 6))
plt.boxplot(df['Text_Length'])
plt.xlabel('Text Length')
plt.ylabel('Number of Characters')
plt.title('Distribution of Text Length in Extended Tweets')
plt.show()