lang = "en" # language code for ENGLISH project_name = "wiki" # wikipedia namespace namespace = 0 # 0 for Main/ Articles date = "20220420" DUMP_DIR = "/public/dumps/public/other/enterprise_html/runs" # directory on PAWS server that holds Wikimedia dumps HTML_DUMP_FN = os.path.join( DUMP_DIR, date, f"{lang+project_name}-NS{namespace}-{date}-ENTERPRISE-HTML.json.tar.gz", ) # final file path print( f"Reading {HTML_DUMP_FN} of size {os.path.getsize(HTML_DUMP_FN)/(1024*1024*1024)} GB" ) article_list = [] with tarfile.open(HTML_DUMP_FN, mode="r:gz") as tar: html_fn = tar.next() print( f"We will be working with {html_fn.name} ({html_fn.size / 1000000000:0.3f} GB)." ) # extract the first article from the first tar chunk with tar.extractfile(html_fn) as fin: for line in fin: article = json.loads(line) break
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter