Preview:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract up to n Ekstra Bladet articles from the Danish subset of mC4.

    The dataset is opened in streaming mode and its "train" split is scanned
    in order. A document is kept when its "url" field contains the substring
    "ekstrabladet.dk". Scanning stops as soon as n documents are collected.

    Args:
        n (int): Number of articles to extract. Must be non-negative.

    Returns:
        pd.DataFrame: One row per collected document. Contains fewer than n
            rows if the stream is exhausted first, and is empty when n is 0.

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Without this guard the stop condition below could never be met for
    # n == 0, and the whole stream would be consumed.
    if n == 0:
        return pd.DataFrame()

    mc4 = load_dataset("mc4", "da", streaming=True)
    docs = []

    with tqdm(total=n) as pbar:
        for doc in mc4["train"]:
            if "ekstrabladet.dk" not in doc["url"]:
                continue

            docs.append(doc)
            pbar.update(1)

            # ">=" rather than "==" so the loop always terminates once the
            # quota has been reached.
            if len(docs) >= n:
                break

    return pd.DataFrame(docs)
@kasperjunge
Download PNG | Download JPEG | Download SVG

Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!

Click to optimize width for Twitter