# Locate a Wikimedia Enterprise HTML dump on the PAWS server and pull the
# first article out of it, as a quick smoke test of the dump's format.
lang = "en"  # language code for ENGLISH
project_name = "wiki"  # wikipedia namespace
namespace = 0  # 0 for Main/ Articles
date = "20220420"  # dump run date (YYYYMMDD)
DUMP_DIR = "/public/dumps/public/other/enterprise_html/runs"  # directory on PAWS server that holds Wikimedia dumps
HTML_DUMP_FN = os.path.join(
    DUMP_DIR,
    date,
    f"{lang}{project_name}-NS{namespace}-{date}-ENTERPRISE-HTML.json.tar.gz",
)  # final file path, e.g. .../20220420/enwiki-NS0-20220420-ENTERPRISE-HTML.json.tar.gz

GIB = 1024 ** 3  # one binary gigabyte; used consistently for both size reports
print(
    f"Reading {HTML_DUMP_FN} of size {os.path.getsize(HTML_DUMP_FN) / GIB:0.3f} GB"
)

article_list = []
with tarfile.open(HTML_DUMP_FN, mode="r:gz") as tar:
    # tar.next() returns the first member, or None if the archive is empty —
    # fail loudly rather than crash later with an AttributeError.
    html_fn = tar.next()
    if html_fn is None:
        raise ValueError(f"{HTML_DUMP_FN} contains no members")
    print(
        f"We will be working with {html_fn.name} ({html_fn.size / GIB:0.3f} GB)."
    )
    # extract the first article from the first tar chunk; each line of the
    # member is one JSON-encoded article, so stop after the first line.
    with tar.extractfile(html_fn) as fin:
        for line in fin:
            article = json.loads(line)
            break
# NOTE(review): removed screenshot-tool UI residue ("Download PNG/JPEG/SVG",
# "optimize width for Twitter") that was accidentally pasted here — it was
# not code and broke the file.