Download n Ekstra Bladet news articles from the Danish mC4 dataset.

PHOTO EMBED

Thu Aug 18 2022 19:04:03 GMT+0000 (Coordinated Universal Time)

Saved by @juungeone ##python ##pandas ##mc4

import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract up to n Ekstra Bladet articles from the Danish subset
    of the mC4 dataset.

    The dataset is opened in streaming mode and its "train" split is
    scanned in order; a document is kept when its "url" field contains
    the substring "ekstrabladet.dk". Scanning stops as soon as n
    documents have been collected.

    Args:
        n (int): Number of articles to extract. Values <= 0 yield an
            empty DataFrame without touching the dataset.
    Returns:
        pd.DataFrame: Ekstra Bladet articles, one row per document.
            Contains fewer than n rows if the stream is exhausted
            before n matches are found.
    """

    # Guard: the loop below only checks the stop condition after a
    # match has been appended, so without this a request for zero (or a
    # negative number of) articles would scan the whole dataset.
    if n <= 0:
        return pd.DataFrame([])

    mc4 = load_dataset("mc4", "da", streaming=True)
    docs = []

    with tqdm(total=n) as pbar:
        for doc in mc4["train"]:
            # Skip documents from other sites.
            if "ekstrabladet.dk" not in doc["url"]:
                continue

            docs.append(doc)
            pbar.update(1)

            if len(docs) >= n:
                break

    return pd.DataFrame(docs)
@kasperjunge
content_copyCOPY