FINAL DATA CLEANING
Tue Jun 18 2024 20:13:01 GMT+0000 (Coordinated Universal Time)
Saved by @madgakantara
from pymongo import MongoClient, errors

# Connection target and the collections involved in the copy.
mongo_uri = "mongodb://localhost:27017/"
database_name = "twitter_database"
source_collection_name = "tweets"
target_collection_name = "final"

# Dotted paths, resolved against each source document by get_nested_field().
# The path string itself is used as the (flat) key in the output document.
fields_to_extract = [
    "json_data.id",
    "json_data.user.id",
    "json_data.created_at",
    "json_data.in_reply_to_status_id",
    "json_data.in_reply_to_user_id",
    "json_data.entities.user_mentions",
    "json_data.lang",
    "json_data.user.location",
]

# NOTE(review): not referenced anywhere in this script; kept so that any code
# relying on the name keeps working. Presumably airline account ids - confirm.
airline_id_wanted = [56377143, 106062176, 124476322, 18332190, 22536055, 20626359]


def get_nested_field(data, field_path):
    """Resolve a dot-separated path inside nested dicts and lists.

    Args:
        data: The root container (dict or list) to walk.
        field_path: Path segments joined by ".". A segment applied to a dict
            is looked up as a key; a segment applied to a list must parse as
            an integer index.

    Returns:
        The value found at the path, or None when any step cannot be taken:
        a missing dict key, a non-integer or out-of-range list index, a
        scalar reached before the path is exhausted, or a None value met
        along the way.
    """
    for key in field_path.split("."):
        if isinstance(data, list):
            try:
                data = data[int(key)]
            except (ValueError, IndexError):
                return None
        elif isinstance(data, dict):
            data = data.get(key)
        else:
            # Scalar reached while path segments remain.
            return None
        if data is None:
            return None
    return data


def _build_clean_doc(doc):
    """Project one source document onto the reduced output schema."""
    new_doc = {field: get_nested_field(doc, field) for field in fields_to_extract}
    new_doc["_id"] = doc["_id"]

    # Use extended_tweet.full_text when it is present and non-empty;
    # otherwise fall back to json_data.text.
    full_text = get_nested_field(doc, "json_data.extended_tweet.full_text")
    if full_text:
        new_doc["json_data.text"] = full_text
    else:
        new_doc["json_data.text"] = get_nested_field(doc, "json_data.text")
    return new_doc


# Bound before the try so the finally clause is safe even when MongoClient()
# itself raises; previously that path ended in a NameError from client.close()
# that hid the real failure.
client = None
try:
    client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
    db = client[database_name]
    source_collection = db[source_collection_name]
    target_collection = db[target_collection_name]

    cursor = source_collection.find()
    for doc in cursor:
        # One insert per document: if an insert fails, the documents written
        # before it stay in the target collection.
        # NOTE(review): the output keys contain "." (e.g. "json_data.id");
        # whether they are accepted depends on driver/server version - confirm.
        target_collection.insert_one(_build_clean_doc(doc))

    print("Data successfully transferred to new collections.")
except errors.ServerSelectionTimeoutError as err:
    print("Failed to connect to MongoDB server:", err)
except errors.PyMongoError as err:
    print("An error occurred while working with MongoDB:", err)
finally:
    if client is not None:
        client.close()
Comments