import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract up to n Ekstra Bladet articles from the Danish subset
    of the mC4 dataset.

    The dataset is opened in streaming mode and its "train" split is
    scanned in order. Every document whose URL contains
    "ekstrabladet.dk" is kept, and scanning stops as soon as n
    documents have been collected.

    Args:
        n (int): Number of articles to extract. A value of zero or
            less returns an empty frame without opening the dataset.

    Returns:
        pd.DataFrame: Ekstra Bladet articles, built from the collected
            documents. Contains fewer than n rows if the stream ends
            before n matching documents are found.
    """
    if n <= 0:
        # Nothing was requested. Without this guard the loop below would
        # collect at least one article before it could test the limit.
        return pd.DataFrame([])

    mc4 = load_dataset("mc4", "da", streaming=True)
    docs = []
    with tqdm(total=n) as pbar:
        for doc in mc4["train"]:
            # Plain substring match on the URL; skip everything else.
            if "ekstrabladet.dk" not in doc["url"]:
                continue
            docs.append(doc)
            pbar.update(1)
            # `>=` rather than `==` so the limit can never be stepped over.
            if len(docs) >= n:
                break
    return pd.DataFrame(docs)
@kasperjunge
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter