# Creating the tweet_data collection from tweets filtered by language
# Tue May 28 2024 21:37:36 GMT+0000 (Coordinated Universal Time)
# Saved by @madgakantara
from pymongo import MongoClient
# Connection string for the MongoDB server on localhost, and the database used.
mongo_uri = "mongodb://localhost:27017/"
database_name = "twitter_database"

# Collection the documents are read from, and the collection they are
# written to.
source_collection_name = "tweets"
target_collection_name = "tweet_data"

# Dotted paths (one key per path segment) of the values copied out of each
# source document into the new document.
fields_to_extract = [
    "json_data.id",
    "json_data.user.id",
    "json_data.created_at",
    "json_data.in_reply_to_status_id",
    "json_data.in_reply_to_user_id",
    "json_data.lang",
    "json_data.place",
    "json_data.user.location",
    "json_data.is_quote_status",
]
def get_nested_field(data, field_path):
    """Return the value at a dotted path inside nested dicts, or None.

    The path is split on "." and each segment is looked up in turn with
    ``.get``, so "json_data.user.id" reads ``data["json_data"]["user"]["id"]``.

    Args:
        data: The outermost mapping to read from.
        field_path: Keys joined by dots, e.g. "json_data.user.id".

    Returns:
        The value at the end of the path. None if a key along the path is
        missing, if a value along the path is None, or if a value part-way
        down the path cannot be descended into (it has no ``.get``).
    """
    current = data
    for key in field_path.split("."):
        # A scalar, list or None part-way down the path has no ``.get``;
        # report the field as absent instead of raising AttributeError.
        if not hasattr(current, "get"):
            return None
        current = current.get(key)
        if current is None:
            return None
    return current
# Values of "json_data.lang" whose documents are copied to the target
# collection. Defined once so the query and the client-side check agree.
allowed_langs = ["en", "nl", "es"]

# Using the client as a context manager closes it even when a read or write
# fails part-way through the transfer.
with MongoClient(mongo_uri) as client:
    db = client[database_name]
    source_collection = db[source_collection_name]
    target_collection = db[target_collection_name]

    # Let the server drop documents in other languages so they are never
    # sent to this process.
    cursor = source_collection.find({"json_data.lang": {"$in": allowed_langs}})
    for doc in cursor:
        # Re-check locally: the server-side match is broader than a plain
        # equality test (it also matches array values containing a listed
        # language), and only an exact value should be copied.
        lang = get_nested_field(doc, "json_data.lang")
        if lang not in allowed_langs:
            continue

        # The new document is flat: each key is the dotted path string itself
        # (e.g. "json_data.user.id"), not a nested sub-document.
        new_doc = {field: get_nested_field(doc, field) for field in fields_to_extract}
        new_doc["_id"] = doc["_id"]  # Keep the original _id
        # NOTE(review): because _id is preserved, re-running against an
        # already populated target is expected to fail on the first duplicate
        # _id - confirm whether the target should be emptied beforehand.
        target_collection.insert_one(new_doc)

print("Data successfully transferred to new collections.")
# [Copy]
# Comments