Snippets Collections
# Hyperparameters for the graph-classification run.
h_feats = 64            # hidden feature size produced by the first conv layer
learn_iterations = 50   # number of passes over the training loader
learn_rate = 0.01       # Adam learning rate

# Input sizes are read from the first graph; the number of classes is derived
# from the largest label value present in the dataset.
model = EntityGraphModule(
    dataset.graphs[0].ndata["feat"].shape[1],
    dataset.graphs[0].edata["feat"].shape[1],
    h_feats,
    dataset.labels.max().item() + 1,
)
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)

# --- Training ---------------------------------------------------------------
model.train()
for _ in range(learn_iterations):
    for batched_graph, labels in train_dataloader:
        # Features are cast to float before being fed to the model.
        node_feats = batched_graph.ndata["feat"].float()
        edge_feats = batched_graph.edata["feat"].float()
        pred = model(batched_graph, node_feats, edge_feats)
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# --- Evaluation -------------------------------------------------------------
# Switch to inference mode and disable autograd: no gradients are needed to
# count correct predictions, so building the graph would only waste memory.
model.eval()
num_correct = 0
num_tests = 0
with torch.no_grad():
    for batched_graph, labels in test_dataloader:
        node_feats = batched_graph.ndata["feat"].float()
        edge_feats = batched_graph.edata["feat"].float()
        pred = model(batched_graph, node_feats, edge_feats)
        num_correct += (pred.argmax(1) == labels).sum().item()
        num_tests += len(labels)

acc = num_correct / num_tests
print("Test accuracy:", acc)
from torch.utils.data.sampler import SubsetRandomSampler
from dgl.dataloading import GraphDataLoader

# Split and batching settings, named so they can be tuned in one place.
train_fraction = 0.8  # share of the dataset used for training
batch_size = 5        # graphs per mini-batch, used by both loaders

num_examples = len(dataset)
num_train = int(num_examples * train_fraction)

# The first ``num_train`` indices form the training subset and the remaining
# indices form the test subset; each sampler draws from its subset in random
# order.
train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=batch_size, drop_last=False
)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=batch_size, drop_last=False
)
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import NNConv, SAGEConv

class EntityGraphModule(nn.Module):
    """Graph classifier: an NNConv layer followed by a SAGEConv layer.

    The first layer uses edge features (through a learned linear edge
    function) together with node features; the second layer maps the hidden
    node representation to ``num_classes`` scores per node. The per-node
    scores are averaged over each graph to give one prediction per graph.

    Args:
        node_in_feats: Size of each input node feature vector.
        edge_in_feats: Size of each input edge feature vector.
        h_feats: Size of the hidden node representation.
        num_classes: Number of output classes.
    """

    def __init__(self, node_in_feats, edge_in_feats, h_feats, num_classes):
        super().__init__()
        # Hand the Linear layer itself to NNConv instead of wrapping it in a
        # lambda. A module assigned as an attribute of another nn.Module is
        # registered as a submodule, so its weights appear in
        # ``self.parameters()`` (and therefore get optimised, saved in the
        # state dict and moved by ``.to(device)``). Behind a lambda the layer
        # is invisible and would stay at its random initialisation.
        edge_func = nn.Linear(edge_in_feats, node_in_feats * h_feats)
        self.conv1 = NNConv(node_in_feats, h_feats, edge_func)

        self.conv2 = SAGEConv(h_feats, num_classes, "pool")

    def forward(self, g, node_features, edge_features):
        """Return one row of class scores per graph contained in ``g``.

        Args:
            g: The (possibly batched) graph to classify.
            node_features: Node feature tensor passed to the first layer.
            edge_features: Edge feature tensor passed to the first layer.
        """
        h = self.conv1(g, node_features, edge_features)
        h = F.relu(h)
        h = self.conv2(g, h)
        # Store the node scores on the graph so they can be averaged per graph.
        g.ndata["h"] = h
        return dgl.mean_nodes(g, "h")
import os

os.environ["DGLBACKEND"] = "pytorch"
import pandas as pd
import torch
import dgl
from dgl.data import DGLDataset

class EntitiesDataset(DGLDataset):
    """Graph-classification dataset read from a JSON-lines file of entities.

    Each line of the file describes one entity with three keys:

    * ``"records"``: list of nodes, each with ``"totalValue"`` and ``"items"``;
    * ``"edges"``: list of edges, each with endpoints ``"a"``/``"b"`` and the
      features ``"R1"``/``"R2"``;
    * ``"fraud"``: the label of the entity.

    Every entity becomes one graph; indexing the dataset yields
    ``(graph, label)`` pairs.
    """

    def __init__(self, entitiesFile):
        # The path is stored before calling the base constructor because
        # process() reads it (the base class is expected to trigger process()
        # during construction).
        self.entitiesFile = entitiesFile
        super().__init__(name="entities")

    def process(self):
        """Parse the entities file into ``self.graphs`` and ``self.labels``."""
        # ``lines`` is documented as a boolean flag.
        entities = pd.read_json(self.entitiesFile, lines=True)

        self.graphs = []
        labels = []

        for _, entity in entities.iterrows():
            edges = entity["edges"]
            records = entity["records"]

            src = torch.LongTensor([edge["a"] for edge in edges])
            dst = torch.LongTensor([edge["b"] for edge in edges])
            # Two rows (R1, R2) transposed into one row per edge. Building it
            # this way keeps a two-column shape even when there are no edges.
            edge_features = torch.LongTensor(
                [
                    [edge["R1"] for edge in edges],
                    [edge["R2"] for edge in edges],
                ]
            ).t()

            # One row per record: [totalValue, items].
            node_features = torch.tensor(
                [[node["totalValue"], node["items"]] for node in records]
            )

            # num_nodes is given explicitly so records without any edge are
            # still part of the graph.
            g = dgl.graph((src, dst), num_nodes=len(records))
            g.edata["feat"] = edge_features
            g.ndata["feat"] = node_features
            g = dgl.add_self_loop(g)

            self.graphs.append(g)
            labels.append(entity["fraud"])

        self.labels = torch.LongTensor(labels)

    def __getitem__(self, i):
        """Return the ``(graph, label)`` pair at index ``i``."""
        return self.graphs[i], self.labels[i]

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)

# Build the dataset from the JSON-lines file, then print the dataset object and
# its first item (a (graph, label) pair, per EntitiesDataset.__getitem__).
dataset = EntitiesDataset("./entities.jsonl")
print(dataset)
print(dataset[0])
{
  "fraud":1,
  "records":[
    {
      "id":0,
      "totalValue":85,
      "items":5
    }
  ],
  "edges":[
    
  ]
}
{
  "fraud":1,
  "records":[
    {
      "id":0,
      "totalValue":85,
      "items":2
    },
    {
      "id":1,
      "totalValue":31,
      "items":4
    },
    {
      "id":2,
      "totalValue":20,
      "items":9
    }
  ],
  "edges":[
    {
      "a":1,
      "b":0,
      "R1":1,
      "R2":1
    },
    {
      "a":2,
      "b":1,
      "R1":0,
      "R2":1
    }
  ]
}
made 1052 comparisons and found 6 matches and 2 entities
found the following entity: Ivan Smxth, Ixan Smith, Ivax Smitx from Cairo
found the following entity: Brxan Williams, Brian Williams from Cape Town
// compare checks every ordered pair of distinct records and collects an edge
// {a.ID, b.ID} for each pair that matches. It returns the number of pairs that
// were checked together with the collected edges. Because pairs are ordered,
// a matching pair is visited in both directions.
func compare(records []Record) (comparisons int, edges [][2]int) {
	for i := range records {
		for j := range records {
			left, right := records[i], records[j]
			if left == right {
				// Equal records are skipped, so a record is never compared with itself.
				continue
			}
			comparisons++
			if !matches(left, right) {
				continue
			}
			edges = append(edges, [2]int{left.ID, right.ID})
		}
	}
	return
}

// connectedComponents groups the node IDs that appear in edges into connected
// components, treating every edge as undirected.
//
// Only IDs that occur in at least one edge are reported. Components are
// returned in ascending order of their internal index (the order in which they
// were created or last merged), and the IDs inside a component keep the order
// in which they were attached, so the result is deterministic for a given edge
// list. An edge from an unseen node to itself yields a single-member component.
func connectedComponents(edges [][2]int) [][]int {
	components := map[int][]int{} // component index -> member IDs
	idx := map[int]int{}          // node ID -> index of its component
	nextIdx := 0

	for _, edge := range edges {
		a, b := edge[0], edge[1]
		aIdx, aOk := idx[a]
		bIdx, bOk := idx[b]
		switch {
		case aOk && bOk && aIdx == bIdx:
			// Both endpoints already share a component: nothing to do.
		case aOk && bOk:
			// Endpoints live in different components: merge them under a
			// fresh index and repoint every member to it.
			merged := append(components[aIdx], components[bIdx]...)
			delete(components, aIdx)
			delete(components, bIdx)
			components[nextIdx] = merged
			for _, member := range merged {
				idx[member] = nextIdx
			}
			nextIdx++
		case aOk:
			// Only a is known: b joins a's component.
			idx[b] = aIdx
			components[aIdx] = append(components[aIdx], b)
		case bOk:
			// Only b is known: a joins b's component.
			idx[a] = bIdx
			components[bIdx] = append(components[bIdx], a)
		case a == b:
			// Self-edge on an unseen node: record the node once, not twice.
			idx[a] = nextIdx
			components[nextIdx] = []int{a}
			nextIdx++
		default:
			// Neither endpoint is known: start a new component.
			idx[a] = nextIdx
			idx[b] = nextIdx
			components[nextIdx] = []int{a, b}
			nextIdx++
		}
	}

	// Walk the indexes in ascending order instead of ranging over the map,
	// whose iteration order is randomised.
	cc := make([][]int, 0, len(components))
	for k := 0; k < nextIdx; k++ {
		if members, ok := components[k]; ok {
			cc = append(cc, members)
		}
	}
	return cc
}

// main loads 100 records, blocks them, compares the records inside each block,
// groups the resulting match edges into connected components and prints a
// summary line followed by one line per component.
func main() {
	records := loadRecords(100)

	var (
		totalComparisons int
		allEdges         = [][2]int{}
	)
	for _, group := range block(records) {
		n, found := compare(group)
		totalComparisons += n
		allEdges = append(allEdges, found...)
	}

	entities := connectedComponents(allEdges)
	fmt.Printf("made %d comparisons and found %d matches and %d entities\n", totalComparisons, len(allEdges), len(entities))

	for _, members := range entities {
		names := make([]string, 0, len(members))
		for _, id := range members {
			// Component members are record IDs, used here as indexes into records.
			names = append(names, records[id].Name)
		}
		// The city is taken from the first member of the component.
		city := records[members[0]].City
		fmt.Printf("found the following entity: %s from %s\n", strings.Join(names, ", "), city)
	}
}
// block partitions records by their City field and returns a map from city
// name to the records that carry it, preserving input order within each city.
func block(records []Record) map[string][]Record {
	byCity := make(map[string][]Record)
	for i := range records {
		city := records[i].City
		byCity[city] = append(byCity[city], records[i])
	}
	return byCity
}

func main() {
	records := loadRecords(100)
	blocks := block(records)
	comparisons := 0
	matchCount := 0
	for _, blockRecords := range blocks {
		c, m := compare(blockRecords)
		comparisons += c
		matchCount += m
	}

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}
Daisy Williams and Dave Williams are probably the same person
Deborax Browx and Debra Brown are probably the same person
Riley Brown and RxxeyxBrown are probably the same person
Dan Willxams and Dave Williams are probably the same person
made 9900 comparisons and found 16 matches
// firstNames is the pool of given names that randomName picks from.
var firstNames = [...]string{"Wade", "Dave", "Seth", "Ivan", "Riley", "Gilbert", "Jorge", "Dan", "Brian", "Roberto", "Daisy", "Deborah", "Isabel", "Stella", "Debra", "Berverly", "Vera", "Angela", "Lucy", "Lauren"}
// lastNames is the pool of family names that randomName picks from.
var lastNames = [...]string{"Smith", "Jones", "Williams", "Brown", "Taylor"}

// randomName builds "<first> <last>" from randomly chosen entries of
// firstNames and lastNames, then overwrites between zero and three randomly
// chosen byte positions with 'x' to simulate typos. A position may be hit more
// than once, and the separating space may be overwritten as well.
func randomName() string {
	first := firstNames[rand.Intn(len(firstNames))]
	last := lastNames[rand.Intn(len(lastNames))]
	raw := []byte(first + " " + last)

	for typos := rand.Intn(4); typos > 0; typos-- {
		pos := rand.Intn(len(raw))
		raw[pos] = 'x'
	}
	return string(raw)
}

// cities is the pool of city names that randomCity picks from.
var cities = [...]string{"Paris", "Berlin", "New York", "Amsterdam", "Shanghai", "San Francisco", "Sydney", "Cape Town", "Brasilia", "Cairo"}

// randomCity returns a randomly chosen entry of cities.
func randomCity() string {
	pick := rand.Intn(len(cities))
	return cities[pick]
}

// loadRecords generates n records. Each record's ID equals its position in the
// returned slice; its name and city come from randomName and randomCity.
func loadRecords(n int) []Record {
	records := make([]Record, n)
	for i := range records {
		rec := &records[i]
		rec.ID = i
		// The name is drawn before the city for every record.
		rec.Name = randomName()
		rec.City = randomCity()
	}
	return records
}

// compare checks every ordered pair of distinct records, prints a line for
// each pair that matches, and returns how many pairs were checked and how many
// of them matched. Because pairs are ordered, a matching pair is reported in
// both directions.
func compare(records []Record) (comparisons, matchCount int) {
	for i := range records {
		for j := range records {
			left, right := records[i], records[j]
			if left == right {
				// Equal records are skipped, so a record is never compared with itself.
				continue
			}
			comparisons++
			if !matches(left, right) {
				continue
			}
			fmt.Printf("%s and %s are probably the same person\n", left.Name, right.Name)
			matchCount++
		}
	}
	return
}

func main() {
	records := loadRecords(100)
	comparisons, matchCount := compare(records)

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}
package main

import (
	"fmt"

	"github.com/hbollon/go-edlib"
)

// Record is one person record, identified by ID and described by a name and
// a city.
type Record struct {
	ID int
	Name string
	City string
}

// matches reports whether a and b are probably the same person: they must be
// in the same city and their names must be within a Levenshtein distance of
// maxNameDistance.
func matches(a, b Record) bool {
	// Largest edit distance between two names that still counts as a match.
	const maxNameDistance = 3

	// Compare cities first: the string equality is cheap and lets us skip the
	// edit-distance computation for records that can never match.
	if a.City != b.City {
		return false
	}
	return edlib.LevenshteinDistance(a.Name, b.Name) <= maxNameDistance
}

// main compares two hand-written records and prints whether matches considers
// them the same person.
func main() {
	a := Record{Name: "Vincent Van Gogh", City: "Paris"}
	b := Record{Name: "Vince Van Gough", City: "Paris"}

	format := "%s and %s are probably not the same person\n"
	if matches(a, b) {
		format = "%s and %s are probably the same person\n"
	}
	fmt.Printf(format, a.Name, b.Name)
}
star

Tue Aug 22 2023 13:17:31 GMT+0000 (Coordinated Universal Time)

#entityresolution #frauddetection #gnn #machinelearning
star

Tue Aug 22 2023 13:16:36 GMT+0000 (Coordinated Universal Time)

#entityresolution #frauddetection #gnn #machinelearning
star

Tue Aug 22 2023 13:15:50 GMT+0000 (Coordinated Universal Time)

#entityresolution #frauddetection #gnn #machinelearning
star

Tue Aug 22 2023 13:14:50 GMT+0000 (Coordinated Universal Time)

#entityresolution #frauddetection #gnn #machinelearning
star

Tue Aug 22 2023 13:12:57 GMT+0000 (Coordinated Universal Time)

#entityresolution #frauddetection #gnn #machinelearning
star

Tue Aug 22 2023 13:10:59 GMT+0000 (Coordinated Universal Time)

#entityresolution #frauddetection #gnn #machinelearning
star

Fri Aug 11 2023 09:19:43 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching #connectedcomponents
star

Fri Aug 11 2023 09:16:23 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching #connectedcomponents
star

Fri Aug 11 2023 09:13:16 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching
star

Fri Aug 11 2023 09:10:13 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching
star

Fri Aug 11 2023 09:07:34 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching
star

Fri Aug 11 2023 09:00:29 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching

Save snippets that work with our extensions

Available in the Chrome Web Store Get Firefox Add-on Get VS Code extension