thiscodeWorks - Organizing the best of code online

[
  "y1:r1:R1",
  "y1:r2:R1",
  "y2:r1:R1",
  "y2:r2:R1",
  "r4:p1:R1",
  "r5:p1:R1",
  "r5:b1:R1",
  "b2:p1:R1",
  "y3:b5:R1",
  "y3:b6:R1"
]

#entityresolution #cbgc

CBGC: Complex Cliques Compressed

{
  "R1": [
    ["y1", "y2", "y3"],
    ["r1", "r2", "r3", "r4", "r5"],
    ["b1", "b2", "b3", "b4", "b5", "b6"]
  ]
}

#entityresolution #cbgc

CBGC: 6 Node Clique Compressed

{
  "R1":[
    ["a", "b", "c", "d", "e", "f"]
  ]
}

#entityresolution #cbgc

CBGC: Triangle Compressed

{
  "R1": [
    ["a", "b", "c"]
  ]
}

#entityresolution #cbgc

CBGC: Edge List

[
  "a:b:R1",
  "a:c:R1",
  "b:c:R1"
]

#entityresolution #frauddetection #gnn #machinelearning

Training and Testing

# Hyperparameters: hidden feature width, number of passes over the training
# data, and the Adam learning rate.
h_feats = 64
learn_iterations = 50
learn_rate = 0.01

# Input sizes are read from the first graph's node/edge feature matrices; the
# number of classes is derived from the largest label present in the dataset.
model = EntityGraphModule(
    dataset.graphs[0].ndata["feat"].shape[1],
    dataset.graphs[0].edata["feat"].shape[1],
    h_feats,
    dataset.labels.max().item() + 1
)
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)

# --- Training ---------------------------------------------------------------
model.train()
for _ in range(learn_iterations):
    for batched_graph, labels in train_dataloader:
        # Features are cast to float before being handed to the model.
        pred = model(batched_graph, batched_graph.ndata["feat"].float(), batched_graph.edata["feat"].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# --- Evaluation -------------------------------------------------------------
# Switch to inference mode and disable autograd: no gradients are needed to
# count correct predictions, so building the graph only wastes memory/time.
model.eval()
num_correct = 0
num_tests = 0
with torch.no_grad():
    for batched_graph, labels in test_dataloader:
        pred = model(batched_graph, batched_graph.ndata["feat"].float(), batched_graph.edata["feat"].float())
        num_correct += (pred.argmax(1) == labels).sum().item()
        num_tests += len(labels)

acc = num_correct / num_tests
print("Test accuracy:", acc)

#entityresolution #frauddetection #gnn #machinelearning

Data Loader

from torch.utils.data.sampler import SubsetRandomSampler
from dgl.dataloading import GraphDataLoader

# Split the dataset by position: the first 80% of the examples are used for
# training and the remainder for testing. Each subset is visited in random
# order through its own SubsetRandomSampler.
num_examples = len(dataset)
num_train = int(0.8 * num_examples)

train_sampler = SubsetRandomSampler(torch.arange(0, num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

# Both loaders yield batches of up to 5 graphs and keep the final partial batch.
train_dataloader = GraphDataLoader(
    dataset,
    sampler=train_sampler,
    batch_size=5,
    drop_last=False,
)
test_dataloader = GraphDataLoader(
    dataset,
    sampler=test_sampler,
    batch_size=5,
    drop_last=False,
)

#entityresolution #frauddetection #gnn #machinelearning

Entity Graph Module

import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import NNConv, SAGEConv

class EntityGraphModule(nn.Module):
    """Two-layer graph classifier: an edge-conditioned NNConv followed by SAGEConv.

    Args:
        node_in_feats: size of each input node feature vector.
        edge_in_feats: size of each input edge feature vector.
        h_feats: size of the hidden node representation produced by the first layer.
        num_classes: number of output classes (size of the per-graph prediction).
    """

    def __init__(self, node_in_feats, edge_in_feats, h_feats, num_classes):
        super().__init__()
        # Maps each edge feature vector to the node_in_feats * h_feats values
        # NNConv uses as that edge's weights. It is assigned to `self` so that
        # PyTorch registers it as a submodule: wrapping it in a lambda held
        # only by a local variable would hide its weights from
        # `model.parameters()`, leaving them untrained and out of the
        # state dict.
        self.edge_func = nn.Linear(edge_in_feats, node_in_feats * h_feats)
        self.conv1 = NNConv(node_in_feats, h_feats, self.edge_func)

        self.conv2 = SAGEConv(h_feats, num_classes, "pool")

    def forward(self, g, node_features, edge_features):
        """Return one prediction row per graph in `g`.

        Node representations are computed by the two convolution layers,
        stored on the graph under ``ndata["h"]`` and averaged per graph with
        ``dgl.mean_nodes``.
        """
        h = self.conv1(g, node_features, edge_features)
        h = F.relu(h)
        h = self.conv2(g, h)
        g.ndata["h"] = h
        return dgl.mean_nodes(g, "h")

#entityresolution #frauddetection #gnn #machinelearning

Entities Dataset

import os

os.environ["DGLBACKEND"] = "pytorch"
import pandas as pd
import torch
import dgl
from dgl.data import DGLDataset

class EntitiesDataset(DGLDataset):
    """Graph-classification dataset built from a JSON Lines file of entities.

    Each line is read as one entity with a ``records`` list (one graph node
    per record), an ``edges`` list and a ``fraud`` label. After processing,
    ``self.graphs`` holds one DGL graph per entity and ``self.labels`` the
    matching labels as a LongTensor.
    """

    def __init__(self, entitiesFile):
        # The path must be stored before calling the base constructor because
        # process() reads it.
        # NOTE(review): this relies on DGLDataset.__init__ triggering
        # process() — confirm against the DGL version in use.
        self.entitiesFile = entitiesFile
        super().__init__(name="entities")

    def process(self):
        """Read the entities file and build one graph plus label per entity."""
        # `lines` is documented as a boolean flag; pass True rather than 1.
        entities = pd.read_json(self.entitiesFile, lines=True)

        self.graphs = []
        self.labels = []

        for _, entity in entities.iterrows():
            edges = entity["edges"]
            records = entity["records"]

            # Edge endpoints: "a" is the source node index, "b" the destination.
            a = torch.LongTensor([edge["a"] for edge in edges])
            b = torch.LongTensor([edge["b"] for edge in edges])

            # One row per edge, columns are the R1 and R2 values of that edge.
            r1_feat = [edge["R1"] for edge in edges]
            r2_feat = [edge["R2"] for edge in edges]
            edge_features = torch.LongTensor([r1_feat, r2_feat]).t()

            # One row per record, columns are totalValue and items.
            node_feat = [[node["totalValue"], node["items"]] for node in records]
            node_features = torch.tensor(node_feat)

            g = dgl.graph((a, b), num_nodes=len(records))
            g.edata["feat"] = edge_features
            g.ndata["feat"] = node_features
            # Self-loops are added after the features are attached.
            # NOTE(review): presumably DGL fills the features of the new
            # self-loop edges with zeros — confirm for the DGL version in use.
            g = dgl.add_self_loop(g)

            self.graphs.append(g)
            self.labels.append(entity["fraud"])

        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        """Return the ``(graph, label)`` pair at index ``i``."""
        return self.graphs[i], self.labels[i]

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)

# Build the dataset from the JSON Lines file in the working directory, then
# print the dataset object and its first (graph, label) pair as a sanity check.
dataset = EntitiesDataset("./entities.jsonl")
print(dataset)
print(dataset[0])

#entityresolution #frauddetection #gnn #machinelearning

Example Record w/o Edges

{
  "fraud":1,
  "records":[
    {
      "id":0,
      "totalValue":85,
      "items":5
    }
  ],
  "edges":[
    
  ]
}

#entityresolution #frauddetection #gnn #machinelearning

Example Record

{
  "fraud":1,
  "records":[
    {
      "id":0,
      "totalValue":85,
      "items":2
    },
    {
      "id":1,
      "totalValue":31,
      "items":4
    },
    {
      "id":2,
      "totalValue":20,
      "items":9
    }
  ],
  "edges":[
    {
      "a":1,
      "b":0,
      "R1":1,
      "R2":1
    },
    {
      "a":2,
      "b":1,
      "R1":0,
      "R2":1
    }
  ]
}

#entityresolution #fuzzymatching #connectedcomponents

Connected Components Output

made 1052 comparisons and found 6 matches and 2 entities
found the following entity: Ivan Smxth, Ixan Smith, Ivax Smitx from Cairo
found the following entity: Brxan Williams, Brian Williams from Cape Town

#entityresolution #fuzzymatching #connectedcomponents

Connected Components

// compare runs matches on every ordered pair of distinct records and returns
// the number of comparisons made together with one {a.ID, b.ID} edge per
// matching pair. Because pairs are ordered, a pair is visited once in each
// direction. Records that are equal as whole structs are skipped.
func compare(records []Record) (comparisons int, edges [][2]int) {
	for i := range records {
		left := records[i]
		for j := range records {
			right := records[j]
			if left == right {
				// don't compare a record with itself
				continue
			}
			comparisons++
			if !matches(left, right) {
				continue
			}
			edges = append(edges, [2]int{left.ID, right.ID})
		}
	}
	return comparisons, edges
}

// connectedComponents groups the node IDs referenced by edges into connected
// components and returns one slice of node IDs per component.
//
// Components are returned in the order in which their members first appear in
// edges, so the result is deterministic for a given input (ranging over the
// internal map would produce a different order on every run). A self-loop
// {x, x} on a node not seen before yields a single-member component instead of
// listing x twice.
func connectedComponents(edges [][2]int) [][]int {
	components := map[int][]int{} // component index -> member node IDs
	nextIdx := 0                  // next unused component index
	idx := map[int]int{}          // node ID -> index of the component holding it

	for _, edge := range edges {
		a := edge[0]
		b := edge[1]
		aIdx, aOk := idx[a]
		bIdx, bOk := idx[b]
		switch {
		case aOk && bOk && aIdx == bIdx: // in same component
			continue
		case aOk && bOk && aIdx != bIdx: // merge two components
			// The merged component gets a fresh index; both old entries are
			// removed and every member is re-pointed at the new index.
			components[nextIdx] = append(components[aIdx], components[bIdx]...)
			delete(components, aIdx)
			delete(components, bIdx)
			for _, x := range components[nextIdx] {
				idx[x] = nextIdx
			}
			nextIdx++
		case aOk && !bOk: // add b to component of a
			idx[b] = aIdx
			components[aIdx] = append(components[aIdx], b)
		case bOk && !aOk: // add a to component of b
			idx[a] = bIdx
			components[bIdx] = append(components[bIdx], a)
		default: // neither node seen yet: create a new component
			idx[a] = nextIdx
			idx[b] = nextIdx
			if a == b {
				// self-loop: the node must be listed only once
				components[nextIdx] = []int{a}
			} else {
				components[nextIdx] = []int{a, b}
			}
			nextIdx++
		}
	}

	// Emit each surviving component once, in order of first appearance of any
	// of its members in the edge list.
	cc := make([][]int, 0, len(components))
	emitted := make(map[int]bool, len(components))
	for _, edge := range edges {
		for _, node := range edge {
			c := idx[node]
			if emitted[c] {
				continue
			}
			emitted[c] = true
			cc = append(cc, components[c])
		}
	}
	return cc
}

// main generates 100 records, compares them block by block, derives entities
// from the matching pairs via connectedComponents, and prints a summary line
// followed by one line per entity.
func main() {
	records := loadRecords(100)

	var comparisons int
	edges := [][2]int{}
	for _, blockRecords := range block(records) {
		blockComparisons, blockEdges := compare(blockRecords)
		comparisons += blockComparisons
		edges = append(edges, blockEdges...)
	}
	entities := connectedComponents(edges)

	fmt.Printf("made %d comparisons and found %d matches and %d entities\n", comparisons, len(edges), len(entities))
	for _, entity := range entities {
		// Component members are record IDs, used here as indexes into records.
		names := make([]string, 0, len(entity))
		for _, id := range entity {
			names = append(names, records[id].Name)
		}
		// The city of the first member is printed for the whole entity.
		fmt.Printf("found the following entity: %s from %s\n", strings.Join(names, ", "), records[entity[0]].City)
	}
}

#entityresolution #fuzzymatching

ER Blocking

// block partitions records by their City field and returns a map from city
// name to the records of that city, in their original relative order.
func block(records []Record) map[string][]Record {
	byCity := make(map[string][]Record)
	for i := range records {
		city := records[i].City
		byCity[city] = append(byCity[city], records[i])
	}
	return byCity
}

// main generates 100 records, groups them into blocks, runs compare inside
// each block only, and prints the total number of comparisons and matches.
func main() {
	records := loadRecords(100)

	var comparisons, matchCount int
	for _, blockRecords := range block(records) {
		blockComparisons, blockMatches := compare(blockRecords)
		comparisons += blockComparisons
		matchCount += blockMatches
	}

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}

#entityresolution #fuzzymatching

Naive Entity Resolution Output

Daisy Williams and Dave Williams are probably the same person
Deborax Browx and Debra Brown are probably the same person
Riley Brown and RxxeyxBrown are probably the same person
Dan Willxams and Dave Williams are probably the same person
made 9900 comparisons and found 16 matches

#entityresolution #fuzzymatching

Naive Entity Resolution

// firstNames and lastNames are the pools from which randomName draws the two
// parts of a generated name.
var firstNames = [...]string{"Wade", "Dave", "Seth", "Ivan", "Riley", "Gilbert", "Jorge", "Dan", "Brian", "Roberto", "Daisy", "Deborah", "Isabel", "Stella", "Debra", "Berverly", "Vera", "Angela", "Lucy", "Lauren"}
var lastNames = [...]string{"Smith", "Jones", "Williams", "Brown", "Taylor"}

// randomName builds "<first> <last>" from randomly chosen entries of
// firstNames and lastNames, then overwrites between 0 and 3 randomly chosen
// byte positions with 'x' to simulate typos. Any position may be hit,
// including the separating space, and the same position may be hit twice.
func randomName() string {
	first := firstNames[rand.Intn(len(firstNames))]
	last := lastNames[rand.Intn(len(lastNames))]
	name := []byte(first + " " + last)
	for remaining := rand.Intn(4); remaining > 0; remaining-- {
		name[rand.Intn(len(name))] = 'x'
	}
	return string(name)
}

var cities = [...]string{"Paris", "Berlin", "New York", "Amsterdam", "Shanghai", "San Francisco", "Sydney", "Cape Town", "Brasilia", "Cairo"}

// randomCity returns a randomly chosen entry of cities.
func randomCity() string {
	pick := rand.Intn(len(cities))
	return cities[pick]
}

// loadRecords generates n records with random names and cities. Each record's
// ID equals its index in the returned slice.
func loadRecords(n int) []Record {
	records := make([]Record, n)
	for i := range records {
		records[i] = Record{ID: i, Name: randomName(), City: randomCity()}
	}
	return records
}

// compare runs matches on every ordered pair of distinct records, prints a
// line for each matching pair, and returns how many comparisons were made and
// how many of them matched. Because pairs are ordered, a pair is visited once
// in each direction. Records that are equal as whole structs are skipped.
func compare(records []Record) (comparisons, matchCount int) {
	for i := range records {
		left := records[i]
		for j := range records {
			right := records[j]
			if left == right {
				// don't compare a record with itself
				continue
			}
			comparisons++
			if !matches(left, right) {
				continue
			}
			fmt.Printf("%s and %s are probably the same person\n", left.Name, right.Name)
			matchCount++
		}
	}
	return comparisons, matchCount
}

func main() {
	records := loadRecords(100)
	comparisons, matchCount := compare(records)

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}

#entityresolution #fuzzymatching

Fuzzy Matching

package main

import (
	"fmt"

	"github.com/hbollon/go-edlib"
)

// Record is a single person entry that entity resolution compares against
// other entries.
type Record struct {
	// ID identifies the record.
	ID int
	// Name is the (possibly misspelled) person name used for fuzzy matching.
	Name string
	// City must be equal for two records to be considered a match.
	City string
}

// matches reports whether a and b probably describe the same person: they
// must be in the same city and their names must be within a Levenshtein
// distance of 3.
func matches(a, b Record) bool {
	// maxNameDistance is the largest edit distance still treated as a typo.
	const maxNameDistance = 3

	// Check the cheap string equality first so the edit distance is only
	// computed for records that can still match; the result is unchanged.
	if a.City != b.City {
		return false
	}
	return edlib.LevenshteinDistance(a.Name, b.Name) <= maxNameDistance
}

// main compares two hard-coded records with slightly different spellings of
// the same name and prints whether matches considers them the same person.
func main() {
	a := Record{Name: "Vincent Van Gogh", City: "Paris"}
	b := Record{Name: "Vince Van Gough", City: "Paris"}

	if !matches(a, b) {
		fmt.Printf("%s and %s are probably not the same person\n", a.Name, b.Name)
		return
	}
	fmt.Printf("%s and %s are probably the same person\n", a.Name, b.Name)
}

CBGC: Complex Graph Remaining Edges

CBGC: Complex Cliques Compressed

CBGC: 6 Node Clique Compressed

CBGC: Triangle Compressed

CBGC: Edge List

Training and Testing

Data Loader

Entity Graph Module

Entities Dataset

Example Record w/o Edges

Example Record

Connected Components Output

Connected Components

ER Blocking

Naive Entity Resolution Output

Naive Entity Resolution

Fuzzy Matching

Save snippets that work with our extensions