GNN dataset create2

PHOTO

Wed Sep 01 2021 09:04:08 GMT+0000 (Coordinated Universal Time)

import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm

class classDataset(InMemoryDataset):
    """In-memory graph dataset holding one graph per ``label_vec`` group.

    ``process`` reads a DataFrame named ``df`` from the enclosing module scope
    (it is not defined in this class); it must provide ``label_vec`` and
    ``text_vec`` columns.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        """Initialise the base dataset, then load the tensors saved by ``process``."""
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared for this dataset."""
        return []

    @property
    def processed_file_names(self):
        """Name of the processed file.

        NOTE(review): ``process`` writes this file with ``torch.save`` even
        though the name ends in ``.csv`` -- confirm the name is intentional.
        """
        return ['./train_vec.csv']

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Turn every ``label_vec`` group of ``df`` into a ``Data`` graph and save them."""
        data_list = []
        # Each email (row) in a label category is a node, so all emails that
        # share a label form one graph.
        grouped = df.groupby('label_vec')
        for label_vec, group in tqdm(grouped):
            # Re-encode text_vec inside the group so that node indices start
            # at 0 for every graph.
            label_email_id = LabelEncoder().fit_transform(group.text_vec)
            group = group.reset_index(drop=True)
            group['label_email_id'] = label_email_id

            # One feature row per distinct node, ordered by node index.  The
            # selection only keeps 'label_email_id' and 'text_vec', so the
            # feature column must be text_vec (reading label_vec here raised
            # AttributeError because that column is not selected).
            node_features = (
                group.loc[group.label_vec == label_vec, ['label_email_id', 'text_vec']]
                .sort_values('label_email_id')
                .text_vec.drop_duplicates()
                .values
            )
            node_features = torch.LongTensor(node_features).unsqueeze(1)

            # Chain consecutive rows: row i is the source of an edge to row i + 1.
            target_nodes = group.label_email_id.values[1:]
            source_nodes = group.label_email_id.values[:-1]
            edge_index = torch.tensor([source_nodes,
                                       target_nodes], dtype=torch.long)
            x = node_features

            # The graph label is the group's label_vec value.
            y = torch.FloatTensor([group.label_vec.values[0]]).type(torch.LongTensor)

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

COPY

Save snippets that work from anywhere online with our extensions

Comments

GNNs

@QuinnFox12

A detailed example of data loaders with PyTorch A detailed example of data loaders with PyTorch colab common useful snippets GNN dataset create1 GNN dataset create2

#dax #tabular.editor #live.dataset #ms.pbi #dataset

Showing DAX Expressions When Using Power BI Datasets

// For every measure in the model, copy its expression into its description.
foreach (var m in Model.AllMeasures) {
  m.Description = m.Expression;
}

#gnn #dataset

GNN dataset create1

class MyOwnDataset(Dataset):
    """Graph dataset that stores one ``.pt`` file per row of the raw CSV."""

    def __init__(self, root, transform=None, pre_transform=None, test=False):
        """Set up the dataset.

        Args:
            root: Where the dataset should be stored; this folder is split
                into raw_dir (downloaded dataset) and processed_dir
                (processed data).
            transform: Forwarded unchanged to the base class.
            pre_transform: Forwarded unchanged to the base class.
            test: When true, graphs are written to and read from
                ``data_test_{idx}.pt`` instead of ``data_{idx}.pt``.
        """
        # Assigned before the base initialiser so that process() can read it.
        # NOTE(review): assumes the base __init__ may call process() -- confirm
        # against the torch_geometric version in use.
        self.test = test
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """If this file exists in raw_dir, the download is not triggered
        (the download function is not implemented here)."""
        return 'cate_id_01.csv'

    @property
    def processed_file_names(self):
        """If these files are found in processed_dir, processing is skipped."""
        return 'not_implemented.pt'

    def download(self):
        """No-op: the raw file is expected to be placed in ``self.raw_dir`` by hand."""
        # path = download_url(url, self.raw_dir)
        pass

    def process(self):
        """Read the raw CSV and save one ``Data`` object per row."""
        self.data = pd.read_csv(self.raw_paths[0])
        for index, row in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # NOTE(review): the feature helpers call GetAtoms()/GetBonds() on
            # this value -- confirm the "categories" column holds objects that
            # provide those methods.
            cate = row["categories"]
            categories_main = row["categories_main"]

            node_feats = self._get_node_features(cate)
            edge_feats = self._get_edge_features(cate)
            edge_index = self._get_adjacency_info(cate)
            label = self._get_labels(categories_main)

            # Carry the SMILES string along only when the CSV provides one.
            extra = {"smiles": row["smiles"]} if "smiles" in row.index else {}
            data = Data(x=node_feats,
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        **extra)
            torch.save(data, self._file_path(index))

    def _file_path(self, idx):
        """Return the processed file path for graph ``idx`` (test or train naming)."""
        prefix = 'data_test' if self.test else 'data'
        return os.path.join(self.processed_dir, f'{prefix}_{idx}.pt')

    def _get_node_features(self, mol):
        """
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # Feature 1: Atomic number
            node_feats.append(atom.GetAtomicNum())
            # Feature 2: Atom degree
            node_feats.append(atom.GetDegree())
            # Feature 3: Formal charge
            node_feats.append(atom.GetFormalCharge())
            # Feature 4: Hybridization
            node_feats.append(atom.GetHybridization())
            # Feature 5: Aromaticity
            node_feats.append(atom.GetIsAromatic())
            # Feature 6: Total Num Hs
            node_feats.append(atom.GetTotalNumHs())
            # Feature 7: Radical Electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # Feature 8: In Ring
            node_feats.append(atom.IsInRing())
            # Feature 9: Chirality
            node_feats.append(atom.GetChiralTag())

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        """
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = []
            # Feature 1: Bond type (as double)
            edge_feats.append(bond.GetBondTypeAsDouble())
            # Feature 2: Rings
            edge_feats.append(bond.IsInRing())
            # Append edge features to matrix (twice, once per direction)
            all_edge_feats += [edge_feats, edge_feats]

        if not all_edge_feats:
            # Keep the [edges, features] layout even when there are no bonds.
            return torch.empty((0, 2), dtype=torch.float)

        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, mol):
        """
        We could also use rdmolops.GetAdjacencyMatrix(mol)
        but we want to be sure that the order of the indices
        matches the order of the edge features
        """
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]

        # reshape(-1, 2) also handles an empty edge list, which view(2, -1)
        # rejects because the free dimension would be ambiguous.
        edge_indices = torch.tensor(edge_indices, dtype=torch.long).reshape(-1, 2)
        return edge_indices.t().contiguous()

    def _get_labels(self, label):
        """Wrap a single label value in a one-element int64 tensor."""
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        """Number of rows in the CSV loaded by ``process``."""
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        return torch.load(self._file_path(idx))

#gnn #dataset

GNN dataset create2

import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm

class classDataset(InMemoryDataset):
    """In-memory graph dataset holding one graph per ``label_vec`` group.

    ``process`` reads a DataFrame named ``df`` from the enclosing module scope
    (it is not defined in this class); it must provide ``label_vec`` and
    ``text_vec`` columns.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        """Initialise the base dataset, then load the tensors saved by ``process``."""
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared for this dataset."""
        return []

    @property
    def processed_file_names(self):
        """Name of the processed file.

        NOTE(review): ``process`` writes this file with ``torch.save`` even
        though the name ends in ``.csv`` -- confirm the name is intentional.
        """
        return ['./train_vec.csv']

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Turn every ``label_vec`` group of ``df`` into a ``Data`` graph and save them."""
        data_list = []
        # Each email (row) in a label category is a node, so all emails that
        # share a label form one graph.
        grouped = df.groupby('label_vec')
        for label_vec, group in tqdm(grouped):
            # Re-encode text_vec inside the group so that node indices start
            # at 0 for every graph.
            label_email_id = LabelEncoder().fit_transform(group.text_vec)
            group = group.reset_index(drop=True)
            group['label_email_id'] = label_email_id

            # One feature row per distinct node, ordered by node index.  The
            # selection only keeps 'label_email_id' and 'text_vec', so the
            # feature column must be text_vec (reading label_vec here raised
            # AttributeError because that column is not selected).
            node_features = (
                group.loc[group.label_vec == label_vec, ['label_email_id', 'text_vec']]
                .sort_values('label_email_id')
                .text_vec.drop_duplicates()
                .values
            )
            node_features = torch.LongTensor(node_features).unsqueeze(1)

            # Chain consecutive rows: row i is the source of an edge to row i + 1.
            target_nodes = group.label_email_id.values[1:]
            source_nodes = group.label_email_id.values[:-1]
            edge_index = torch.tensor([source_nodes,
                                       target_nodes], dtype=torch.long)
            x = node_features

            # The graph label is the group's label_vec value.
            y = torch.FloatTensor([group.label_vec.values[0]]).type(torch.LongTensor)

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

#dataset

multi txt to pandas

import os
import re
import codecs
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

# Build one DataFrame row per text file found under
# original_datasets/<folder_name>/<label>/ and write it to a CSV.
print("Please Enter the Exact Dataset Main Folder Name")
folder_name = input()

dataset_dir = os.path.join("original_datasets", folder_name)
labels = os.listdir(dataset_dir)
if ".DS_Store" in labels:
    labels.remove(".DS_Store")

# Evaluated once instead of once per token; a set also gives constant-time
# membership tests.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

all_data = []

# uncomment to debug
# labels = labels[:5]

for label in labels:
    label_dir = os.path.join(dataset_dir, label)
    for file_name in os.listdir(label_dir):
        # The context manager closes each file as soon as it has been read.
        with open(os.path.join(label_dir, file_name), "r", encoding='latin-1') as f:
            raw_data = f.read()
        # Keep letters only, lower-case, then drop English stop words.
        words = non_letters.sub(' ', raw_data).lower().split()
        preprocessed_data = ' '.join(
            word for word in words if word not in english_stopwords
        )
        all_data.append([file_name, preprocessed_data, label])

all_data = np.asarray(all_data)
df = pd.DataFrame(all_data)
print("===========DataFrame-Complete===========")

# Make sure the output folder exists so the work above is not lost on save.
os.makedirs('pre_processed_df', exist_ok=True)
df.to_csv('pre_processed_df/pre_processed_' + folder_name + '.csv', index=False)

#jupyter #notebook #titanic #dataset

titanic - Jupyter Notebook

# Ordered categorical dtypes: passenger class runs '3' < '2' < '1' and the
# age bands run 'Kind' < 'Jong' < 'Middelbaar' < 'Oud'.
cat_type = CategoricalDtype(ordered=True, categories=['3', '2', '1'])
cat_type2 = CategoricalDtype(
    ordered=True,
    categories=['Kind', 'Jong', 'Middelbaar', 'Oud'],
)

# Recode the raw values and cast the recoded columns to categoricals.
df1['pclass'] = df1['pclass'].map({1: '1', 2: '2', 3: '3'}).astype(cat_type)
df1['survived'] = df1['survived'].map({1: True, 0: False})
df1['sex'] = df1['sex'].map({'male': 'M', 'female': 'V'}).astype('category')

# Ages become whole numbers stored as int8; fares are rounded to 2 decimals.
df1['age'] = df1['age'].map(lambda age: round(age)).astype('int8')
df1['fare'] = df1['fare'].map(lambda fare: round(fare, 2))

# Cut the age range into four equal-width bins and label them.
df1['age_cat'] = pd.cut(
    df1['age'],
    bins=4,
    labels=('Kind', 'Jong', 'Middelbaar', 'Oud'),
).astype(cat_type2)

# Keep only the columns of interest and display the result.
df1 = df1.filter(
    items=['pclass', 'name', 'survived', 'sex', 'age', 'age_cat', 'fare']
)
df1

#python #pandas #dataset #numerical #categorical #eda

Printing, listing and counting the numerical or categorical features in a dataset Pandas

# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical = []
categorical = []
for column in df.columns:
    if df[column].dtype == 'O':
        categorical.append(column)
    else:
        numerical.append(column)

# Numerical columns (used to infer distribution plots)
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

# Categorical columns (used to infer distribution plots)
print('There are {} Categorical variables\n'.format(len(categorical)))
print('The Categorical variables are :', categorical)

#python #pandas #dataset #eda #missingdata

Quantifying Missing data with charts

# Bar chart of the mean of the null mask (share of missing values) per column.
missing_share = data.isna().mean()
missing_share.plot.bar(figsize=(12, 6))
plt.xlabel('Variables')
plt.ylabel('Percentage of missing values')
plt.title('Quantifying missing data')

#python #pandas #dataset #eda #cardinality

Quantifying cardinality In datasets

# Bar chart of the number of distinct values in each column.
unique_counts = data.nunique()
unique_counts.plot.bar(figsize=(12, 6))
plt.xlabel('Variables')
plt.ylabel('Number of unique categories')
plt.title('Cardinality')

## Version with 5% threshold

# Frequencies sorted from most to least common, with a red line at 0.05.
# Note: despite its name, `fig` holds the object returned by plot.bar().
fig = label_freq.sort_values(ascending=False).plot.bar()
fig.axhline(y=0.05, color='red')
fig.set(
    ylabel='percentage of cars within each category',
    xlabel='Variable: class',
    title='Identifying Rare Categories',
)
plt.show()

#python #pandas #dataset #eda #outliers #boundaries

Function to find boundaries for outliers

def find_boundaries(df, variable, distance=1.5):
    """Return the ``(upper, lower)`` outlier limits of a column using the IQR rule.

    Args:
        df: DataFrame that contains the column.
        variable: Name of the column to analyse.
        distance: Multiple of the inter-quartile range added above the third
            quartile and subtracted below the first quartile.

    Returns:
        A tuple ``(upper_boundary, lower_boundary)`` -- upper limit first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_limit = first_quartile - (spread * distance)
    upper_limit = third_quartile + (spread * distance)
    return upper_limit, lower_limit

#python #pandas #dataset #eda #extractyear #datetime

Extract the year from a date column in pandas

# Take the year out of the 'date' column; use .dt.month for the month, etc.
df['year'] = df['date'].dt.year
df.head()

#array #dataset #html

Add an array to a HTML dataset

import "./styles.css";

// Timestamp taken at load time (not referenced elsewhere in this snippet).
const date = new Date();

// Mount point; its data-posts attribute starts out as an empty JSON array.
const root = document.getElementById("app");
root.dataset.posts = "[]";

// Seed posts rendered by posts() and extended by createPost().
const items = [
  { title: "book 1", author: "David" },
  { title: "book 2", author: "Kevin" },
];

// Render the titles of all items as a <ul> inside root, 500 ms after the call.
const posts = () => {
  setTimeout(() => {
    const listItems = items.map((item) => `<li>${item.title}</li>`).join("");
    root.innerHTML = `<ul>${listItems}</ul>`;
  }, 500);
};



// Append a new post to items, mirror the list into root's data-posts
// attribute, then invoke the callback.
const createPost = (post, callback) => {
  const item = {
    title: post.title,
    author: post.author,
  };

  items.push(item); // push the item to the items array

  // Update the data-posts attribute first, so the callback sees the new list
  // and the attribute is still updated if the callback throws.
  root.dataset.posts = JSON.stringify(items);

  callback(); // invoke the callback function
};

// Adds "book 3" and re-renders through posts(). createPost has no return
// statement, so this logs `undefined`.
console.log(
  createPost({ title: "book 3", author: "Derek"}, posts)
);

#gnn #pytorch #trainset #loaddata

A detailed example of data loaders with PyTorch

# Load entire dataset
# Load the whole training set (features and labels) in one go
X, y = torch.load('some_training_set_with_labels.pt')

# Train model
for epoch in range(max_epochs):
    for i in range(n_batches):
        # Local batches and labels
        # NOTE(review): the slice width is n_batches, the same value used as
        # the loop count -- confirm that a batch size was not intended here.
        local_X, local_y = X[i*n_batches:(i+1)*n_batches,], y[i*n_batches:(i+1)*n_batches,]

        # Your model (placeholder)
        [...]

#gnn #pytorch #loaddata #trainset

A detailed example of data loaders with PyTorch

# Unoptimized generator
# Iterable that yields (features, labels) pairs, as unpacked in the loop below
training_generator = SomeSingleCoreGenerator('some_training_set_with_labels.pt')

# Train model
for epoch in range(max_epochs):
    for local_X, local_y in training_generator:
        # Your model (placeholder)
        [...]

#textpreprocessing #nlp #gnn #dataload #pytorch

A detailed example of data loaders with PyTorch

# Load entire dataset
# Load the whole training set (features and labels) in one go
X, y = torch.load('some_training_set_with_labels.pt')
 
# Train model
for epoch in range(max_epochs):
    for i in range(n_batches):
        # Local batches and labels
        # NOTE(review): the slice width is n_batches, the same value used as
        # the loop count -- confirm that a batch size was not intended here.
        local_X, local_y = X[i*n_batches:(i+1)*n_batches,], y[i*n_batches:(i+1)*n_batches,]
 
        # Your model (placeholder)
        [...]
         
         
# other
# Unoptimized generator: iterable yielding (features, labels) pairs
training_generator = SomeSingleCoreGenerator('some_training_set_with_labels.pt')
 
# Train model
for epoch in range(max_epochs):
    for local_X, local_y in training_generator:
        # Your model (placeholder)
        [...]

#entityresolution #frauddetection #gnn #machinelearning

Example Record

{
  "fraud":1,
  "records":[
    {
      "id":0,
      "totalValue":85,
      "items":2
    },
    {
      "id":1,
      "totalValue":31,
      "items":4
    },
    {
      "id":2,
      "totalValue":20,
      "items":9
    }
  ],
  "edges":[
    {
      "a":1,
      "b":0,
      "R1":1,
      "R2":1
    },
    {
      "a":2,
      "b":1,
      "R1":0,
      "R2":1
    }
  ]
}

GNN dataset create2

Save snippets that work from anywhere online with our extensions

Comments

More like this

GNNs

Browse more snippets >>

GNN dataset create2

Save snippets that work from anywhere online with our extensions

Comments

More like this

GNNs

Browse more snippets >>

Embed code snippet