multi txt to pandas

PHOTO

Fri Sep 10 2021 02:09:39 GMT+0000 (Coordinated Universal Time)

import os
import re
import codecs
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

# Ask which dataset to process; the name must match a sub-folder of
# original_datasets/.
print("Please Enter the Exact Dataset Main Folder Name")
folder_name = input()

dataset_dir = "original_datasets/" + folder_name

# Every entry of the dataset folder is treated as one label; the macOS
# Finder metadata file is not a label.
labels = os.listdir(dataset_dir)
if ".DS_Store" in labels:
    labels.remove(".DS_Store")

# Build the stop-word set once. The original called
# stopwords.words('english') for every single word and scanned the
# returned list each time.
english_stopwords = set(stopwords.words('english'))

all_data = []

# To debug on a small sample, uncomment:
# labels = labels[:5]

for i in labels:
    label_dir = dataset_dir + '/' + i
    instances_in_a_label = os.listdir(label_dir)
    for j in instances_in_a_label:
        # Finder metadata can also appear inside label folders; it is not
        # a text instance.
        if j == ".DS_Store":
            continue
        # The context manager closes the handle; the original leaked one
        # open file per instance.
        with open(label_dir + '/' + j, "r", encoding='latin-1') as f:
            raw_data = f.read()
        # Keep ASCII letters only, lower-case, then drop English stop words.
        preprocessed_data = re.sub('[^a-zA-Z]', ' ', raw_data).lower()
        preprocessed_data = ' '.join(
            word for word in preprocessed_data.split()
            if word not in english_stopwords
        )
        # One row per file: [file name, cleaned text, label].
        all_data.append([j, preprocessed_data, i])

all_data = np.asarray(all_data)
df = pd.DataFrame(all_data)
print("===========DataFrame-Complete===========")

# Make sure the output folder exists so the save cannot fail on a fresh
# checkout.
os.makedirs('pre_processed_df', exist_ok=True)
df.to_csv('pre_processed_df/pre_processed_' + folder_name + '.csv', index=False)

COPY

Save snippets that work from anywhere online with our extensions

Comments

text-preprocessing

@QuinnFox12

.partition() (picking up piece of string between separators) findall and search text between 2 strings apply function and def Python regex related snippets convert tra to sim chinese remove punc and stopword chinese most common words for each sector and visualize preprocessing Text Full and path convert dataframe to txt, to list Working with null nan nul preprocessing Text Full and path colab common useful snippets multi txt to pandas convert stopword list from sim to tra n-gram with filter POS tag Retrain spacy tagger pos working with time series snippets Extract Text From PDF with Python time in pandas dataframe python vlookup dataframe remove blank string in list check if 2 strings in string

pandas

@QuinnFox12

Combine columns in dataframe create and save dataframe to csv replace function in dataframe apply function and def convert tra to sim chinese create and save dataframe to csv remove punc and stopword chinese preprocessing Text Full and path convert dataframe to txt, to list Working with null nan nul preprocessing Text Full and path multi txt to pandas Pandas selection iloc loc note working with time series snippets group by the first column and get second column as lists in rows: Remove blank cell in pandas dataframe time in pandas dataframe python vlookup dataframe select pandas column as dataframe instead of series dataframe all related Concate dataframe in for loop

NLP

@QuinnFox12

convert tra to sim chinese remove punc and stopword chinese Chinese POS most common words for each sector and visualize preprocessing Text Full and path convert dataframe to txt, to list preprocessing Text Full and path colab common useful snippets multi txt to pandas convert stopword list from sim to tra Pandas selection iloc loc note n-gram with filter POS tag Retrain spacy tagger pos Train Fasttext - GloVe algorithm on my own corpus Extract Text From PDF with Python tạo wordcloud

#dax #tabular.editor #live.dataset #ms.pbi #dataset

Showing DAX Expressions When Using Power BI Datasets

// Copy each measure's expression into its Description property.
foreach (var measure in Model.AllMeasures)
{
    measure.Description = measure.Expression;
}

#gnn #dataset

GNN dataset create1

class MyOwnDataset(Dataset):
    """Dataset that turns each row of a raw CSV file into one saved graph file.

    NOTE(review): the feature helpers call RDKit-style molecule methods
    (``GetAtoms``, ``GetBonds``) on the value read from the ``categories``
    column. Confirm that column really holds molecule-like objects, or
    convert it before the helpers are called.
    """

    def __init__(self, root, transform=None, pre_transform=None, test=False):
        """Set up the dataset.

        Args:
            root: Where the dataset should be stored. This folder is split
                into raw_dir (downloaded dataset) and processed_dir
                (processed data).
            transform: Forwarded unchanged to the base class.
            pre_transform: Forwarded unchanged to the base class.
            test: When true, processed files are written and read as
                ``data_test_{index}.pt`` instead of ``data_{index}.pt``.
        """
        # process() and get() read self.test, so it is set before the base
        # initializer runs (presumably the base class may call process()
        # during construction -- verify against the base class in use).
        self.test = test
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Name of the raw CSV file.

        If this file exists in raw_dir, the download is not triggered
        (the download function is not implemented here).
        """
        return 'cate_id_01.csv'

    @property
    def processed_file_names(self):
        """Name of the processed file; if it is found, processing is skipped.

        NOTE(review): process() never writes a file with this name, so
        processing is presumably repeated on every run -- confirm intent.
        """
        return 'not_implemented.pt'

    def download(self):
        """No-op: the raw file is expected to be in raw_dir already."""
        # Download to `self.raw_dir`.
        # path = download_url(url, self.raw_dir)
        pass

    def process(self):
        """Read the raw CSV and save one graph file per row in processed_dir."""
        self.data = pd.read_csv(self.raw_paths[0])
        for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # Read from the current row; the original indexed an undefined
            # name `df` here.
            cate = mol["categories"]
            categories_main = mol["categories_main"]
            # Get node features
            node_feats = self._get_node_features(cate)
            # Get edge features
            edge_feats = self._get_edge_features(cate)
            # Get adjacency info
            edge_index = self._get_adjacency_info(cate)
            # Get labels info
            label = self._get_labels(categories_main)

            # Attach the "smiles" value only when the CSV has that column,
            # so files without it do not raise KeyError.
            extra = {"smiles": mol["smiles"]} if "smiles" in mol else {}

            # Create the data object once and save it under the name that
            # get() will look for.
            data = Data(x=node_feats,
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        **extra)
            file_name = (f'data_test_{index}.pt' if self.test
                         else f'data_{index}.pt')
            torch.save(data, os.path.join(self.processed_dir, file_name))

    def _get_node_features(self, mol):
        """
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # Feature 1: Atomic number
            node_feats.append(atom.GetAtomicNum())
            # Feature 2: Atom degree
            node_feats.append(atom.GetDegree())
            # Feature 3: Formal charge
            node_feats.append(atom.GetFormalCharge())
            # Feature 4: Hybridization
            node_feats.append(atom.GetHybridization())
            # Feature 5: Aromaticity
            node_feats.append(atom.GetIsAromatic())
            # Feature 6: Total Num Hs
            node_feats.append(atom.GetTotalNumHs())
            # Feature 7: Radical Electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # Feature 8: In Ring
            node_feats.append(atom.IsInRing())
            # Feature 9: Chirality
            node_feats.append(atom.GetChiralTag())

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        """
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = []
            # Feature 1: Bond type (as double)
            edge_feats.append(bond.GetBondTypeAsDouble())
            # Feature 2: Rings
            edge_feats.append(bond.IsInRing())
            # Append edge features to matrix (twice, once per direction)
            all_edge_feats += [edge_feats, edge_feats]

        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, mol):
        """
        We could also use rdmolops.GetAdjacencyMatrix(mol)
        but we want to be sure that the order of the indices
        matches the order of the edge features
        """
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            # Both directions, in the same order as _get_edge_features.
            edge_indices += [[i, j], [j, i]]

        edge_indices = torch.tensor(edge_indices)
        # Transpose the list of pairs into a 2-row tensor.
        edge_indices = edge_indices.t().to(torch.long).view(2, -1)
        return edge_indices

    def _get_labels(self, label):
        """Wrap a single label value in a one-element int64 tensor."""
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        """Return the number of rows of the CSV read by process().

        NOTE(review): self.data is only assigned inside process(); if
        process() has not run on this instance, this raises AttributeError.
        """
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        file_name = f'data_test_{idx}.pt' if self.test else f'data_{idx}.pt'
        data = torch.load(os.path.join(self.processed_dir, file_name))
        return data

#gnn #dataset

GNN dataset create2

import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm

class classDataset(InMemoryDataset):
    """In-memory graph dataset with one graph per ``label_vec`` group.

    NOTE(review): process() reads a DataFrame named ``df`` and uses
    ``LabelEncoder`` and ``Data``; none of them is defined or imported in
    this snippet -- confirm they are in scope where the class is used.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        """Initialise the base class, then load the collated data from disk."""
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared."""
        return []

    @property
    def processed_file_names(self):
        """File that process() saves to and __init__ loads from.

        NOTE(review): the file has a .csv name but is written with
        torch.save -- confirm the name is intentional.
        """
        return ['./train_vec.csv']

    def download(self):
        """No-op: nothing is downloaded."""
        pass

    def process(self):
        """Build one graph per ``label_vec`` group of ``df`` and save them."""
        data_list = []
        # Process by label_vec: treat each email in a label category as a
        # node, so all emails with the same label form one graph.
        grouped = df.groupby('label_vec')
        for label_vec, group in tqdm(grouped):
            # Re-encode text_vec within each group, since node indices must
            # count from 0 inside every graph.
            label_email_id = LabelEncoder().fit_transform(group.text_vec)
            group = group.reset_index(drop=True)
            group['label_email_id'] = label_email_id
            # Node features are the distinct text_vec values ordered by node
            # id. The original read `.label_vec` here, a column that is not
            # in the two-column selection, which raises AttributeError.
            node_features = (
                group.loc[group.label_vec == label_vec,
                          ['label_email_id', 'text_vec']]
                .sort_values('label_email_id')
                .text_vec.drop_duplicates()
                .values
            )

            node_features = torch.LongTensor(node_features).unsqueeze(1)
            # Chain consecutive rows: row k is the source of row k + 1.
            target_nodes = group.label_email_id.values[1:]
            source_nodes = group.label_email_id.values[:-1]

            edge_index = torch.tensor([source_nodes,
                                       target_nodes], dtype=torch.long)
            x = node_features

            # Every row in the group shares the label, so take the first.
            y = torch.FloatTensor([group.label_vec.values[0]]).type(torch.LongTensor)

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

#dataset

multi txt to pandas

import os
import re
import codecs
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

# Ask which dataset to process; the name must match a sub-folder of
# original_datasets/.
print("Please Enter the Exact Dataset Main Folder Name")
folder_name = input()

dataset_dir = "original_datasets/" + folder_name

# Every entry of the dataset folder is treated as one label; the macOS
# Finder metadata file is not a label.
labels = os.listdir(dataset_dir)
if ".DS_Store" in labels:
    labels.remove(".DS_Store")

# Build the stop-word set once. The original called
# stopwords.words('english') for every single word and scanned the
# returned list each time.
english_stopwords = set(stopwords.words('english'))

all_data = []

# To debug on a small sample, uncomment:
# labels = labels[:5]

for i in labels:
    label_dir = dataset_dir + '/' + i
    instances_in_a_label = os.listdir(label_dir)
    for j in instances_in_a_label:
        # Finder metadata can also appear inside label folders; it is not
        # a text instance.
        if j == ".DS_Store":
            continue
        # The context manager closes the handle; the original leaked one
        # open file per instance.
        with open(label_dir + '/' + j, "r", encoding='latin-1') as f:
            raw_data = f.read()
        # Keep ASCII letters only, lower-case, then drop English stop words.
        preprocessed_data = re.sub('[^a-zA-Z]', ' ', raw_data).lower()
        preprocessed_data = ' '.join(
            word for word in preprocessed_data.split()
            if word not in english_stopwords
        )
        # One row per file: [file name, cleaned text, label].
        all_data.append([j, preprocessed_data, i])

all_data = np.asarray(all_data)
df = pd.DataFrame(all_data)
print("===========DataFrame-Complete===========")

# Make sure the output folder exists so the save cannot fail on a fresh
# checkout.
os.makedirs('pre_processed_df', exist_ok=True)
df.to_csv('pre_processed_df/pre_processed_' + folder_name + '.csv', index=False)

#jupyter #notebook #titanic #dataset

titanic - Jupyter Notebook

# Ordered categorical dtypes. The category order is the sort order, so for
# passenger class '3' < '2' < '1'.
cat_type = CategoricalDtype(categories=['3', '2', '1'], ordered=True)
cat_type2 = CategoricalDtype(categories=['Kind','Jong','Middelbaar','Oud'], ordered=True)

# Recode each column and cast it to its final dtype in one step.
df1['survived'] = df1['survived'].map({1: True, 0: False})
df1['pclass'] = df1['pclass'].map({1: '1', 2: '2', 3: '3'}).astype(cat_type)
df1['sex'] = df1['sex'].map({'male': 'M', 'female': 'V'}).astype('category')
df1['age'] = df1['age'].map(lambda x: round(x)).astype('int8')
df1['fare'] = df1['fare'].map(lambda x: round(x, 2))

# Bin the (already rounded) ages into four equal-width, labelled groups.
df1['age_cat'] = pd.cut(
    df1['age'], bins=4, labels=('Kind','Jong','Middelbaar','Oud')
).astype(cat_type2)

# Keep only the listed columns, in this order, and display the result.
df1 = df1.filter(items=['pclass', 'name', 'survived', 'sex', 'age', 'age_cat', 'fare'])
df1

#python #pandas #dataset #numerical #categorical #eda

Printing, listing and counting the numerical or categorical features in a dataset Pandas

# Split the column names by dtype in a single pass: object ('O') columns
# count as categorical, every other dtype as numerical.
numerical = []
categorical = []
for column in df.columns:
    if df[column].dtype == 'O':
        categorical.append(column)
    else:
        numerical.append(column)

# Report the numerical columns first ...
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

# ... then the categorical ones.
print('There are {} Categorical variables\n'.format(len(categorical)))
print('The Categorical variables are :', categorical)

#python #pandas #dataset #eda #missingdata

Quantifying Missing data with charts

# Share of missing values per column. isnull().mean() yields a fraction
# between 0 and 1, although the axis label below says "Percentage".
missing_share = data.isnull().mean()
missing_share.plot.bar(figsize=(12,6))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')

#python #pandas #dataset #eda #cardibality

Quantifying cardinality In datasets

# Number of distinct values per column, drawn as a bar chart.
unique_counts = data.nunique()
unique_counts.plot.bar(figsize=(12,6))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality')

## Version with 5% threshold

# Bars sorted from most to least frequent. The object returned by
# plot.bar() is kept under the original name `fig`.
sorted_freq = label_freq.sort_values(ascending=False)
fig = sorted_freq.plot.bar()

# Horizontal red line at y = 0.05, the 5% threshold.
fig.axhline(y=0.05, color='red')

fig.set_ylabel('percentage of cars within each category')
fig.set_xlabel('Variable: class')
fig.set_title('Identifying Rare Categories')
plt.show()

#python #pandas #dataset #eda #outliers #boundaries

Function to find boundaries for outliers

def find_boundaries(df, variable, distance=1.5):
    """Return the outlier boundaries of a column from its inter-quartile range.

    Args:
        df: DataFrame holding the column.
        variable: Name of the column to inspect.
        distance: Multiple of the inter-quartile range added above the
            third quartile and subtracted below the first quartile.

    Returns:
        A tuple ``(upper_boundary, lower_boundary)`` -- upper first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)

    # Width of the margin placed on each side of the quartiles.
    margin = (third_quartile - first_quartile) * distance

    return third_quartile + margin, first_quartile - margin

#python #pandas #dataset #eda #extractyear #datetime

Extract the year from a date column in pandas

# Extract the year from the 'date' column; use .dt.month for the month, etc.
# The original had this explanation as bare text after the statement, which
# is a syntax error.
df['year'] = df['date'].dt.year
df.head()

#array #dataset #html

Add an array to a HTML dataset

import "./styles.css";

// Timestamp taken at load time; nothing in this snippet reads it.
const date = new Date();

// Mount point. Its data-posts attribute starts out as an empty JSON array.
const root = document.getElementById("app");
root.dataset.posts = "[]";

// Initial list of posts.
const items = [
  { title: "book 1", author: "David" },
  { title: "book 2", author: "Kevin" },
];

// Render the titles of all items as a <ul> inside the root element,
// after a 500 ms delay.
const posts = () => {
  setTimeout(() => {
    const listItems = items.map((item) => `<li>${item.title}</li>`).join("");
    root.innerHTML = `<ul>${listItems}</ul>`;
  }, 500);
};



// Append a copy of `post` (title and author only) to `items`, invoke
// `callback`, then mirror the whole list into the root's data-posts
// attribute as JSON.
const createPost = (post, callback) => {
  const { title, author } = post;

  items.push({ title, author });
  callback();

  root.dataset.posts = JSON.stringify(items);
};

// createPost has no return statement, so the value logged here is undefined.
const createPostResult = createPost({ title: "book 3", author: "Derek"}, posts);
console.log(createPostResult);

#dataset #prevent #duplicate #event #listener

Prevent multiple click events

/**
 if rerendering occurs or to stop duplicate events

if (button.dataset.recShareTracked) return;
button.dataset.recShareTracked = "true";

*/



// example

 /**
  * Attach a click listener to every ".rec-user-actions__cta-button" inside
  * ".rec-user-actions" and push an "interaction_click" event carrying the
  * button's whitespace-normalised text. Buttons already flagged through
  * data-rec-share-tracked are skipped, so calling this again does not add
  * duplicate listeners.
  */
 function recShareTracking() {
        const container = document.querySelector(".rec-user-actions");
        if (!container) return;

        const shareButtons = container.querySelectorAll(
            ".rec-user-actions__cta-button",
        );
        if (shareButtons.length === 0) return;

        console.log({shareButtons})

        for (const button of shareButtons) {
            // Prevent duplicate listeners
            if (button.dataset.recShareTracked) continue;
            button.dataset.recShareTracked = "true";

            button.addEventListener("click", (event) => {
                const target = event.target;
                const hitsThisButton =
                    target === button ||
                    target.closest(".rec-user-actions__cta-button") === button;
                if (!hitsThisButton) return;

                // Collapse runs of whitespace so the tracked text is one line.
                const rawText = button.textContent?.trim() || "";
                const buttonText = rawText.replace(/\s+/g, " ").trim() || "";
                if (!buttonText) return;

                gtmPush({
                    event: "interaction_click",
                    component_name: "button",
                    click_text: buttonText,
                    click_url: null,
                });
            });
        }
    }

multi txt to pandas

Save snippets that work from anywhere online with our extensions

Comments

More like this

text-preprocessing

pandas

NLP

Browse more snippets >>

multi txt to pandas

Save snippets that work from anywhere online with our extensions

Comments

More like this

text-preprocessing

pandas

NLP

Browse more snippets >>

Embed code snippet