# --- Dump selection -------------------------------------------------------
lang = "en"  # language code for ENGLISH
project_name = "wiki"  # wikipedia namespace
namespace = 0  # 0 for Main/ Articles
date = "20220420"  # dump run date, YYYYMMDD

# Directory on the PAWS server that holds the Wikimedia Enterprise HTML dumps.
DUMP_DIR = "/public/dumps/public/other/enterprise_html/runs"

# Full path to the selected dump archive, e.g.
#   <DUMP_DIR>/20220420/enwiki-NS0-20220420-ENTERPRISE-HTML.json.tar.gz
HTML_DUMP_FN = os.path.join(
    DUMP_DIR,
    date,
    f"{lang}{project_name}-NS{namespace}-{date}-ENTERPRISE-HTML.json.tar.gz",
)

# Report the on-disk size of the dump before opening it.
# NOTE: the divisor 1024**3 yields gibibytes, so the label is "GiB" (the
# original said "GB", which would be bytes / 1e9); :0.3f avoids printing
# a raw full-precision float.
print(
    f"Reading {HTML_DUMP_FN} of size "
    f"{os.path.getsize(HTML_DUMP_FN) / (1024 ** 3):0.3f} GiB"
)

# Accumulator for parsed articles (populated later in the workflow).
article_list = []

# Open the gzipped tar dump and peek at its first member without
# extracting the whole archive to disk.
with tarfile.open(HTML_DUMP_FN, mode="r:gz") as tar:
    first_member = tar.next()  # TarInfo for the first chunk inside the archive
    print(
        f"We will be working with {first_member.name} ({first_member.size / 1000000000:0.3f} GB)."
    )
    # Each line of the member is one article's JSON record; parse only the
    # first line and stop.
    with tar.extractfile(first_member) as fin:
        for raw_line in fin:
            article = json.loads(raw_line)
            break