Snippets Collections
import "./styles.css";

const root = document.getElementById("app");
root.setAttribute("data-posts", "[]");

const items = [
  {
    title: "book 1",
    author: "David",
  },
  {
    title: "book 2",
    author: "Kevin",
  }
];

// render the list of posts after a simulated 500 ms delay
const posts = () => {
  setTimeout(() => {
    let output = `<ul>`;
    for (let item of items) {
      output += `<li>${item.title}</li>`;
    }
    output += `</ul>`;
    root.innerHTML = output;
  }, 500);
};

const createPost = (post, callback) => {
  const item = {
    title: post.title,
    author: post.author,
  };

  items.push(item); // push the new item onto the items array
  callback(); // invoke the callback function

  // mirror all the items into the element's data-posts attribute
  root.dataset.posts = JSON.stringify(items);
};

// createPost returns nothing, so call it directly instead of logging its result
createPost({ title: "book 3", author: "Derek" }, posts);
# dt accessors pull components out of a datetime column: dt.year, dt.month, dt.day, etc.
df['year'] = df['date'].dt.year
df.head()
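
A quick sketch of the same dt accessors on a toy frame (the frame and its values here are made up for illustration; pd.to_datetime handles parsing if the column is still a string):

import pandas as pd

demo = pd.DataFrame({'date': pd.to_datetime(['2021-01-15', '2022-06-30'])})
demo['year'] = demo['date'].dt.year           # 2021, 2022
demo['month'] = demo['date'].dt.month         # 1, 6
demo['weekday'] = demo['date'].dt.day_name()  # 'Friday', 'Thursday'
print(demo)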
def find_boundaries(df, variable, distance=1.5):
    """Return the (upper, lower) IQR-based outlier boundaries for a column.

    distance=1.5 gives the classic Tukey fences; distance=3 flags only extreme outliers.
    """
    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)

    lower_boundary = df[variable].quantile(0.25) - (IQR * distance)
    upper_boundary = df[variable].quantile(0.75) + (IQR * distance)

    return upper_boundary, lower_boundary
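
A usage sketch on a hypothetical 'fare' column; note the function returns the upper boundary first:

upper, lower = find_boundaries(df, 'fare', distance=1.5)
outliers = df[(df['fare'] > upper) | (df['fare'] < lower)]
print('{} outliers outside [{:.2f}, {:.2f}]'.format(len(outliers), lower, upper))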
data.nunique().plot.bar(figsize=(12,6))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality')

## Version with 5% threshold
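
label_freq is not defined in this snippet; a minimal way to build it, assuming data is the car dataset and 'class' is the categorical column of interest, is the relative frequency of each category:

label_freq = data['class'].value_counts(normalize=True)  # fraction of cars per category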

ax = label_freq.sort_values(ascending=False).plot.bar()
ax.axhline(y=0.05, color='red')  # 5% rarity threshold
ax.set_ylabel('Proportion of cars within each category')
ax.set_xlabel('Variable: class')
ax.set_title('Identifying Rare Categories')
plt.show()
data.isnull().mean().plot.bar(figsize=(12,6))
plt.ylabel('Proportion of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')
# Get the Numerical Data list to infer distribution plots

numerical = [var for var in df.columns if df[var].dtype != 'O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are:', numerical)

# Get the Categorical Data list to infer distribution plots

categorical = [var for var in df.columns if df[var].dtype == 'O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are:', categorical)
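
The two lists feed straight into quick distribution plots; a minimal sketch using only pandas plotting:

import matplotlib.pyplot as plt

for var in numerical:
    df[var].plot.hist(bins=30)
    plt.title(var)
    plt.show()

for var in categorical:
    df[var].value_counts().plot.bar()
    plt.title(var)
    plt.show()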
from pandas.api.types import CategoricalDtype

cat_type = CategoricalDtype(categories=['3', '2', '1'], ordered=True)
cat_type2 = CategoricalDtype(categories=['Kind', 'Jong', 'Middelbaar', 'Oud'], ordered=True)

df1['pclass'] = df1['pclass'].map({1: '1', 2: '2', 3: '3'})
df1['survived'] = df1['survived'].map({1: True, 0: False})
df1['sex'] = df1['sex'].map({'male': 'M', 'female': 'V'})
df1['pclass'] = df1['pclass'].astype(cat_type)
df1['sex'] = df1['sex'].astype('category')
df1['age'] = df1['age'].map(lambda x: round(x))  # assumes missing ages were handled beforehand
df1['age'] = df1['age'].astype('int8')
df1['fare'] = df1['fare'].map(lambda x: round(x, 2))
df1['age_cat'] = pd.cut(df1['age'], bins=4, labels=('Kind','Jong','Middelbaar','Oud'))
df1['age_cat'] = df1['age_cat'].astype(cat_type2)
df1 = df1.filter(items=['pclass', 'name', 'survived', 'sex', 'age', 'age_cat', 'fare'])
df1
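
Because both dtypes were created with ordered=True, comparisons and sorting respect the declared order; a quick check (hypothetical usage):

print(df1['age_cat'].cat.categories)    # Kind < Jong < Middelbaar < Oud
print((df1['age_cat'] > 'Jong').sum())  # passengers beyond the 'Jong' bucket
df1.sort_values('age_cat').head()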
import os
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

print("Please enter the exact dataset main folder name")
folder_name = input()

labels = os.listdir(os.path.join("original_datasets", folder_name))
if ".DS_Store" in labels:
    labels.remove(".DS_Store")

all_data = []

# uncomment to debug on a small subset
# labels = labels[:5]

# build the stopword set once instead of re-reading it for every word
stop_words = set(stopwords.words('english'))

for label in labels:
    label_dir = os.path.join("original_datasets", folder_name, label)
    for file_name in os.listdir(label_dir):
        with open(os.path.join(label_dir, file_name), "r", encoding='latin-1') as f:
            raw_data = f.read()
        # keep letters only, lowercase, then drop English stopwords
        words = re.sub('[^a-zA-Z]', ' ', raw_data).lower().split()
        preprocessed_data = ' '.join(word for word in words if word not in stop_words)
        all_data.append([file_name, preprocessed_data, label])

all_data = np.asarray(all_data)
df = pd.DataFrame(all_data, columns=['filename', 'text', 'label'])  # column names are assumed; the original left them unnamed
print("===========DataFrame-Complete===========")

df.to_csv('pre_processed_df/pre_processed_' + folder_name + '.csv', index=False)
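
The classDataset snippet below groups df by a label_vec column and encodes a text_vec column, neither of which is built here; a minimal sketch of one way to add them, assuming plain integer encoding is acceptable:

from sklearn.preprocessing import LabelEncoder

df['label_vec'] = LabelEncoder().fit_transform(df['label'])  # one id per class
df['text_vec'] = LabelEncoder().fit_transform(df['text'])    # one id per distinct document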
import torch
from torch_geometric.data import InMemoryDataset, Data
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

class classDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(classDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []
    @property
    def processed_file_names(self):
        return ['./train_vec.csv']

    def download(self):
        pass
    
    def process(self):
        
        data_list = []
        # process by label_vec: treat each email as a node, so all emails
        # sharing the same label form one graph
        grouped = df.groupby('label_vec')
        for label_vec, group in tqdm(grouped):
            # re-encode text_vec within each group so node indices count from 0
            label_email_id = LabelEncoder().fit_transform(group.text_vec)
            group = group.reset_index(drop=True)
            group['label_email_id'] = label_email_id
            # one feature row per node, ordered by the new node index
            node_features = group.loc[group.label_vec == label_vec, ['label_email_id', 'text_vec']] \
                .sort_values('label_email_id').text_vec.drop_duplicates().values

            node_features = torch.LongTensor(node_features).unsqueeze(1)
            # chain consecutive emails: node i links to node i + 1
            target_nodes = group.label_email_id.values[1:]
            source_nodes = group.label_email_id.values[:-1]

            edge_index = torch.tensor([source_nodes,
                                       target_nodes], dtype=torch.long)
            x = node_features

            y = torch.FloatTensor([group.label_vec.values[0]]).type(torch.LongTensor)

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
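
A usage sketch, assuming recent PyTorch Geometric (where DataLoader lives in torch_geometric.loader) and a root folder of your choosing:

from torch_geometric.loader import DataLoader

dataset = classDataset(root='data/')  # process() runs on first use
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    print(batch.x.shape, batch.edge_index.shape, batch.y.shape)
    break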
import os
import pandas as pd
import torch
from torch_geometric.data import Dataset, Data
from rdkit import Chem
from tqdm import tqdm

class MyOwnDataset(Dataset):
    def __init__(self, root, test=False, transform=None, pre_transform=None):
        '''root: where the dataset should be stored; this folder is split into
        raw_dir (the downloaded dataset) and processed_dir (the processed data).
        test selects which set of processed files to save and load.'''
        self.test = test
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        '''If this file exists in raw_dir, download() is not triggered
        (download() is not implemented here anyway).'''
        return 'cate_id_01.csv'

    @property
    def processed_file_names(self):
        '''If these files are found in processed_dir, processing is skipped.'''
        return 'not_implemented.pt'

    def download(self):
        # Download to `self.raw_dir`.
        #path = download_url(url, self.raw_dir)
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # build an RDKit molecule object from the row's SMILES string
            mol_obj = Chem.MolFromSmiles(mol["smiles"])
            # get node features
            node_feats = self._get_node_features(mol_obj)
            # get edge features
            edge_feats = self._get_edge_features(mol_obj)
            # get adjacency info
            edge_index = self._get_adjacency_info(mol_obj)
            # get label info
            label = self._get_labels(mol["categories_main"])

            # create the data object, keeping the SMILES string for reference
            data = Data(x=node_feats,
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        smiles=mol["smiles"])
            if self.test:
                torch.save(data,
                           os.path.join(self.processed_dir,
                                        f'data_test_{index}.pt'))
            else:
                torch.save(data,
                           os.path.join(self.processed_dir,
                                        f'data_{index}.pt'))
    def _get_node_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # Feature 1: Atomic number        
            node_feats.append(atom.GetAtomicNum())
            # Feature 2: Atom degree
            node_feats.append(atom.GetDegree())
            # Feature 3: Formal charge
            node_feats.append(atom.GetFormalCharge())
            # Feature 4: Hybridization
            node_feats.append(atom.GetHybridization())
            # Feature 5: Aromaticity
            node_feats.append(atom.GetIsAromatic())
            # Feature 6: Total Num Hs
            node_feats.append(atom.GetTotalNumHs())
            # Feature 7: Radical Electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # Feature 8: In Ring
            node_feats.append(atom.IsInRing())
            # Feature 9: Chirality
            node_feats.append(atom.GetChiralTag())

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = []
            # Feature 1: Bond type (as double)
            edge_feats.append(bond.GetBondTypeAsDouble())
            # Feature 2: Rings
            edge_feats.append(bond.IsInRing())
            # Append node features to matrix (twice, per direction)
            all_edge_feats += [edge_feats, edge_feats]

        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, mol):
        """
        We could also use rdmolops.GetAdjacencyMatrix(mol)
        but we want to be sure that the order of the indices
        matches the order of the edge features
        """
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]

        edge_indices = torch.tensor(edge_indices)
        edge_indices = edge_indices.t().to(torch.long).view(2, -1)
        return edge_indices

    def _get_labels(self, label):
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_{idx}.pt'))   
        return data
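
A usage sketch; the root folder is arbitrary, and test=True switches to the data_test_*.pt files:

train_ds = MyOwnDataset(root='data/')            # expects data/raw/cate_id_01.csv
test_ds = MyOwnDataset(root='data/', test=True)
sample = train_ds[0]
print(sample.x.shape, sample.edge_index.shape, sample.y)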
// Tabular Editor script: copy every measure's DAX expression into its Description
foreach (var m in Model.AllMeasures) {
  m.Description = m.Expression;
}