Snippets Collections
import "./styles.css";

const root = document.getElementById("app");
root.setAttribute("data-posts", "[]");

const items = [
  {
    title: "book 1",
    author: "David",
  },
  {
    title: "book 2",
    author: "Kevin",
  }
];

// render the list of posts after a simulated 500 ms delay
const posts = () => {
  setTimeout(() => {
    let output = `<ul>`;
    for (let item of items) {
      output += `<li>${item.title}</li>`;
    }
    output += `</ul>`;
    root.innerHTML = output;
  }, 500);
};

const createPost = (post, callback) => {
  const item = {
    title: post.title,
    author: post.author,
  };

  items.push(item); // push the new item onto the items array
  callback(); // invoke the callback function

  // mirror all the items into the element's data-posts attribute
  root.dataset.posts = JSON.stringify(items);
};

// createPost returns nothing, so call it directly instead of logging its result
createPost({ title: "book 3", author: "Derek" }, posts);
# dt accessors pull components out of a datetime column: dt.year, dt.month, dt.day, etc.
df['year'] = df['date'].dt.year
df.head()
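
A quick sketch of the same dt accessors on a toy frame (the frame and its values here are made up for illustration; pd.to_datetime handles parsing if the column is still a string):

import pandas as pd

demo = pd.DataFrame({'date': pd.to_datetime(['2021-01-15', '2022-06-30'])})
demo['year'] = demo['date'].dt.year           # 2021, 2022
demo['month'] = demo['date'].dt.month         # 1, 6
demo['weekday'] = demo['date'].dt.day_name()  # 'Friday', 'Thursday'
print(demo)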
def find_boundaries(df, variable, distance=1.5):
    """Return the (upper, lower) IQR-based outlier boundaries for a column.

    distance=1.5 gives the classic Tukey fences; distance=3 flags only extreme outliers.
    """
    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)

    lower_boundary = df[variable].quantile(0.25) - (IQR * distance)
    upper_boundary = df[variable].quantile(0.75) + (IQR * distance)

    return upper_boundary, lower_boundary
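
A usage sketch on a hypothetical 'fare' column; note the function returns the upper boundary first:

upper, lower = find_boundaries(df, 'fare', distance=1.5)
outliers = df[(df['fare'] > upper) | (df['fare'] < lower)]
print('{} outliers outside [{:.2f}, {:.2f}]'.format(len(outliers), lower, upper))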
data.nunique().plot.bar(figsize=(12,6))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality')

## Version with 5% threshold
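
label_freq is not defined in this snippet; a minimal way to build it, assuming data is the car dataset and 'class' is the categorical column of interest, is the relative frequency of each category:

label_freq = data['class'].value_counts(normalize=True)  # fraction of cars per category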

ax = label_freq.sort_values(ascending=False).plot.bar()
ax.axhline(y=0.05, color='red')  # 5% rarity threshold
ax.set_ylabel('Proportion of cars within each category')
ax.set_xlabel('Variable: class')
ax.set_title('Identifying Rare Categories')
plt.show()
data.isnull().mean().plot.bar(figsize=(12,6))
plt.ylabel('Proportion of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')
# Get the Numerical Data list to infer distribution plots

numerical = [var for var in df.columns if df[var].dtype != 'O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are:', numerical)

# Get the Categorical Data list to infer distribution plots

categorical = [var for var in df.columns if df[var].dtype == 'O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are:', categorical)
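
The two lists feed straight into quick distribution plots; a minimal sketch using only pandas plotting:

import matplotlib.pyplot as plt

for var in numerical:
    df[var].plot.hist(bins=30)
    plt.title(var)
    plt.show()

for var in categorical:
    df[var].value_counts().plot.bar()
    plt.title(var)
    plt.show()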
from pandas.api.types import CategoricalDtype

cat_type = CategoricalDtype(categories=['3', '2', '1'], ordered=True)
cat_type2 = CategoricalDtype(categories=['Kind', 'Jong', 'Middelbaar', 'Oud'], ordered=True)

df1['pclass'] = df1['pclass'].map({1: '1', 2: '2', 3: '3'})
df1['survived'] = df1['survived'].map({1: True, 0: False})
df1['sex'] = df1['sex'].map({'male': 'M', 'female': 'V'})
df1['pclass'] = df1['pclass'].astype(cat_type)
df1['sex'] = df1['sex'].astype('category')
df1['age'] = df1['age'].map(lambda x: round(x))  # assumes missing ages were handled beforehand
df1['age'] = df1['age'].astype('int8')
df1['fare'] = df1['fare'].map(lambda x: round(x, 2))
df1['age_cat'] = pd.cut(df1['age'], bins=4, labels=('Kind','Jong','Middelbaar','Oud'))
df1['age_cat'] = df1['age_cat'].astype(cat_type2)
df1 = df1.filter(items=['pclass', 'name', 'survived', 'sex', 'age', 'age_cat', 'fare'])
df1
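
Because both dtypes were created with ordered=True, comparisons and sorting respect the declared order; a quick check (hypothetical usage):

print(df1['age_cat'].cat.categories)    # Kind < Jong < Middelbaar < Oud
print((df1['age_cat'] > 'Jong').sum())  # passengers beyond the 'Jong' bucket
df1.sort_values('age_cat').head()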
import os
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

print("Please enter the exact dataset main folder name")
folder_name = input()

labels = os.listdir(os.path.join("original_datasets", folder_name))
if ".DS_Store" in labels:
    labels.remove(".DS_Store")

all_data = []

# uncomment to debug on a small subset
# labels = labels[:5]

# build the stopword set once instead of re-reading it for every word
stop_words = set(stopwords.words('english'))

for label in labels:
    label_dir = os.path.join("original_datasets", folder_name, label)
    for file_name in os.listdir(label_dir):
        with open(os.path.join(label_dir, file_name), "r", encoding='latin-1') as f:
            raw_data = f.read()
        # keep letters only, lowercase, then drop English stopwords
        words = re.sub('[^a-zA-Z]', ' ', raw_data).lower().split()
        preprocessed_data = ' '.join(word for word in words if word not in stop_words)
        all_data.append([file_name, preprocessed_data, label])

all_data = np.asarray(all_data)
df = pd.DataFrame(all_data, columns=['filename', 'text', 'label'])  # column names are assumed; the original left them unnamed
print("===========DataFrame-Complete===========")

df.to_csv('pre_processed_df/pre_processed_' + folder_name + '.csv', index=False)
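
The classDataset snippet below groups df by a label_vec column and encodes a text_vec column, neither of which is built here; a minimal sketch of one way to add them, assuming plain integer encoding is acceptable:

from sklearn.preprocessing import LabelEncoder

df['label_vec'] = LabelEncoder().fit_transform(df['label'])  # one id per class
df['text_vec'] = LabelEncoder().fit_transform(df['text'])    # one id per distinct document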
import torch
from torch_geometric.data import InMemoryDataset, Data
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

class classDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(classDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []
    @property
    def processed_file_names(self):
        return ['./train_vec.csv']

    def download(self):
        pass
    
    def process(self):
        
        data_list = []
        # process by label_vec: treat each email as a node, so all emails
        # sharing the same label form one graph
        grouped = df.groupby('label_vec')
        for label_vec, group in tqdm(grouped):
            # re-encode text_vec within each group so node indices count from 0
            label_email_id = LabelEncoder().fit_transform(group.text_vec)
            group = group.reset_index(drop=True)
            group['label_email_id'] = label_email_id
            # one feature row per node, ordered by the new node index
            node_features = group.loc[group.label_vec == label_vec, ['label_email_id', 'text_vec']] \
                .sort_values('label_email_id').text_vec.drop_duplicates().values

            node_features = torch.LongTensor(node_features).unsqueeze(1)
            # chain consecutive emails: node i links to node i + 1
            target_nodes = group.label_email_id.values[1:]
            source_nodes = group.label_email_id.values[:-1]

            edge_index = torch.tensor([source_nodes,
                                       target_nodes], dtype=torch.long)
            x = node_features

            y = torch.FloatTensor([group.label_vec.values[0]]).type(torch.LongTensor)

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
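
A usage sketch, assuming recent PyTorch Geometric (where DataLoader lives in torch_geometric.loader) and a root folder of your choosing:

from torch_geometric.loader import DataLoader

dataset = classDataset(root='data/')  # process() runs on first use
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    print(batch.x.shape, batch.edge_index.shape, batch.y.shape)
    break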
import os
import pandas as pd
import torch
from torch_geometric.data import Dataset, Data
from rdkit import Chem
from tqdm import tqdm

class MyOwnDataset(Dataset):
    def __init__(self, root, test=False, transform=None, pre_transform=None):
        '''root: where the dataset should be stored; this folder is split into
        raw_dir (the downloaded dataset) and processed_dir (the processed data).
        test selects which set of processed files to save and load.'''
        self.test = test
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        '''If this file exists in raw_dir, download() is not triggered
        (download() is not implemented here anyway).'''
        return 'cate_id_01.csv'

    @property
    def processed_file_names(self):
        '''If these files are found in processed_dir, processing is skipped.'''
        return 'not_implemented.pt'

    def download(self):
        # Download to `self.raw_dir`.
        #path = download_url(url, self.raw_dir)
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # build an RDKit molecule object from the row's SMILES string
            mol_obj = Chem.MolFromSmiles(mol["smiles"])
            # get node features
            node_feats = self._get_node_features(mol_obj)
            # get edge features
            edge_feats = self._get_edge_features(mol_obj)
            # get adjacency info
            edge_index = self._get_adjacency_info(mol_obj)
            # get label info
            label = self._get_labels(mol["categories_main"])

            # create the data object, keeping the SMILES string for reference
            data = Data(x=node_feats,
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        smiles=mol["smiles"])
            if self.test:
                torch.save(data,
                           os.path.join(self.processed_dir,
                                        f'data_test_{index}.pt'))
            else:
                torch.save(data,
                           os.path.join(self.processed_dir,
                                        f'data_{index}.pt'))
    def _get_node_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # Feature 1: Atomic number        
            node_feats.append(atom.GetAtomicNum())
            # Feature 2: Atom degree
            node_feats.append(atom.GetDegree())
            # Feature 3: Formal charge
            node_feats.append(atom.GetFormalCharge())
            # Feature 4: Hybridization
            node_feats.append(atom.GetHybridization())
            # Feature 5: Aromaticity
            node_feats.append(atom.GetIsAromatic())
            # Feature 6: Total Num Hs
            node_feats.append(atom.GetTotalNumHs())
            # Feature 7: Radical Electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # Feature 8: In Ring
            node_feats.append(atom.IsInRing())
            # Feature 9: Chirality
            node_feats.append(atom.GetChiralTag())

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = []
            # Feature 1: Bond type (as double)
            edge_feats.append(bond.GetBondTypeAsDouble())
            # Feature 2: Rings
            edge_feats.append(bond.IsInRing())
            # Append node features to matrix (twice, per direction)
            all_edge_feats += [edge_feats, edge_feats]

        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, mol):
        """
        We could also use rdmolops.GetAdjacencyMatrix(mol)
        but we want to be sure that the order of the indices
        matches the order of the edge features
        """
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]

        edge_indices = torch.tensor(edge_indices)
        edge_indices = edge_indices.t().to(torch.long).view(2, -1)
        return edge_indices

    def _get_labels(self, label):
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_{idx}.pt'))   
        return data
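
A usage sketch; the root folder is arbitrary, and test=True switches to the data_test_*.pt files:

train_ds = MyOwnDataset(root='data/')            # expects data/raw/cate_id_01.csv
test_ds = MyOwnDataset(root='data/', test=True)
sample = train_ds[0]
print(sample.x.shape, sample.edge_index.shape, sample.y)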
// Tabular Editor script: copy every measure's DAX expression into its Description
foreach (var m in Model.AllMeasures) {
  m.Description = m.Expression;
}