import pandas as pd
from tqdm import tqdm
from datasets import load_dataset


def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract n Ekstra Bladet articles from the Danish subset of the mC4 dataset.

    The dataset is opened in streaming mode and its "train" split is scanned
    in order. Every document whose URL contains "ekstrabladet.dk" is kept
    until n documents have been collected or the stream runs out.

    Args:
        n (int): Number of articles to extract.

    Returns:
        pd.DataFrame: Ekstra Bladet articles. Contains fewer than n rows if
            the stream ends before n matches are found, and is empty when
            n <= 0.
    """
    # Nothing was requested: return early instead of opening the stream.
    # Without this guard a stop test of the form `count == n` can never fire
    # for n <= 0, and the whole split would be scanned.
    if n <= 0:
        return pd.DataFrame([])

    mc4 = load_dataset("mc4", "da", streaming=True)
    docs = []
    with tqdm(total=n) as pbar:
        for doc in mc4["train"]:
            if "ekstrabladet.dk" not in doc["url"]:
                continue
            docs.append(doc)
            pbar.update(1)
            # `>=` rather than `==` so the loop always terminates once the
            # requested number of documents has been reached.
            if len(docs) >= n:
                break
    return pd.DataFrame(docs)


# @kasperjunge
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter