Creating the tweet_data collection from tweets filtered by language

PHOTO

Tue May 28 2024 21:37:36 GMT+0000 (Coordinated Universal Time)

from pymongo import MongoClient

# Connection string for the MongoDB server (local instance, port 27017).
mongo_uri = "mongodb://localhost:27017/"
# Database that holds both the source and the target collection.
database_name = "twitter_database"

# Collection the full documents are read from.
source_collection_name = "tweets"
# Collection the reduced documents are written to.
target_collection_name = "tweet_data"

# Dot-separated paths of the fields copied into each reduced document.
# Each path string is also used verbatim as the key in the new document.
# NOTE(review): keys containing "." may be rejected or awkward to query on
# some MongoDB/driver versions -- confirm against the deployment in use.
fields_to_extract = [
    "json_data.id",
    "json_data.user.id",
    "json_data.created_at",
    "json_data.in_reply_to_status_id",
    "json_data.in_reply_to_user_id",
    "json_data.lang",
    "json_data.place",
    "json_data.user.location",
    "json_data.is_quote_status"
]

def get_nested_field(data, field_path):
    """Return the value found at a dot-separated path inside nested mappings.

    The path is split on "." and each segment is looked up with ``.get()``
    on the value produced by the previous segment.

    Args:
        data: The outermost mapping (e.g. a MongoDB document).
        field_path: Dot-separated key path such as ``"json_data.user.id"``.

    Returns:
        The value at the end of the path, or ``None`` when any segment is
        missing, maps to ``None``, or is reached through a value that is not
        a mapping (``None``, a list, a string, a number, ...).
    """
    for key in field_path.split('.'):
        try:
            data = data.get(key)
        except AttributeError:
            # The current value has no .get(), so it is not a mapping and
            # the remaining path cannot be resolved.
            return None
        if data is None:
            return None
    return data
# Copy a reduced version of every tweet written in one of the accepted
# languages from the source collection into the target collection.

# Language codes (value of "json_data.lang") whose tweets are kept.
accepted_languages = ("en", "nl", "es")

client = MongoClient(mongo_uri)
try:
    db = client[database_name]
    source_collection = db[source_collection_name]
    target_collection = db[target_collection_name]

    cursor = source_collection.find()
    for doc in cursor:
        lang = get_nested_field(doc, "json_data.lang")

        # Skip tweets whose language is missing or not in the accepted set.
        if lang not in accepted_languages:
            continue

        # Build a new document holding only the configured fields; each key
        # is the dotted path itself.
        new_doc = {field: get_nested_field(doc, field) for field in fields_to_extract}
        new_doc["_id"] = doc["_id"]  # Keep the original _id
        # NOTE(review): re-running against a target collection that already
        # holds these _id values will presumably fail with a duplicate-key
        # error -- confirm whether an upsert is wanted instead.
        target_collection.insert_one(new_doc)
finally:
    # Release the connection even when the transfer fails part-way.
    client.close()

# Only reached when the whole transfer completed without raising.
print("Data successfully transferred to new collections.")

COPY