Code snippet to load and read a portion of a large Wikimedia HTML dump file on the PAWS server (Wikimedia's hosted Jupyter notebook service); saved because it seemed useful for the Outreachy internship.
Sun Nov 24 2024 15:27:26 GMT+0000 (Coordinated Universal Time)
Saved by @SamiraYS
lang = "en" # language code for ENGLISH
project_name = "wiki" # wikipedia namespace
namespace = 0 # 0 for Main/ Articles
date = "20220420"
DUMP_DIR = "/public/dumps/public/other/enterprise_html/runs" # directory on PAWS server that holds Wikimedia dumps
HTML_DUMP_FN = os.path.join(
    DUMP_DIR,
    date,
    f"{lang + project_name}-NS{namespace}-{date}-ENTERPRISE-HTML.json.tar.gz",
)  # final file path
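# For the values above, HTML_DUMP_FN resolves to:
# /public/dumps/public/other/enterprise_html/runs/20220420/enwiki-NS0-20220420-ENTERPRISE-HTML.json.tar.gz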
print(
    f"Reading {HTML_DUMP_FN} of size {os.path.getsize(HTML_DUMP_FN) / (1024 * 1024 * 1024):0.2f} GB"
)
article_list = []  # parsed articles will be collected here

with tarfile.open(HTML_DUMP_FN, mode="r:gz") as tar:
    html_fn = tar.next()  # first member of the archive (a chunk of newline-delimited JSON)
    print(
        f"We will be working with {html_fn.name} ({html_fn.size / 1000000000:0.3f} GB)."
    )
    # extract and parse the first article from the first tar chunk
    with tar.extractfile(html_fn) as fin:
        for line in fin:
            article = json.loads(line)
            article_list.append(article)
            break
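A possible next step, sketched under assumptions: instead of stopping at the first article, scan the first few JSON lines and print a couple of fields. This continues from the snippet above (it reuses HTML_DUMP_FN, article_list, and the imports); N_PREVIEW is a made-up constant, and the "name" and "identifier" keys are guesses at the Enterprise HTML dump schema, so inspect article.keys() to confirm what is actually there.

N_PREVIEW = 5  # hypothetical constant: how many articles to preview

with tarfile.open(HTML_DUMP_FN, mode="r:gz") as tar:
    html_fn = tar.next()  # first chunk of newline-delimited JSON
    with tar.extractfile(html_fn) as fin:
        for i, line in enumerate(fin):
            if i >= N_PREVIEW:
                break
            article = json.loads(line)
            article_list.append(article)
            # "name" and "identifier" are assumed keys; adjust to the real schema
            print(article.get("name"), article.get("identifier"))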
Source: https://appledora.hashnode.dev/outreach-bw3?source