Snippets Collections
made 1052 comparisons and found 6 matches and 2 entities
found the following entity: Ivan Smxth, Ixan Smith, Ivax Smitx from Cairo
found the following entity: Brxan Williams, Brian Williams from Cape Town
// compare runs matches on every ordered pair of distinct records.
//
// comparisons is the number of calls made to matches. edges holds one
// [2]int{a.ID, b.ID} entry for every pair on which matches returned true.
// Both (a, b) and (b, a) are visited, so a pair that matches in both
// directions contributes two edges.
func compare(records []Record) (comparisons int, edges [][2]int) {
	for i := range records {
		for j := range records {
			left, right := records[i], records[j]
			if left == right {
				// Records with identical field values are never compared.
				continue
			}
			comparisons++
			if !matches(left, right) {
				continue
			}
			edges = append(edges, [2]int{left.ID, right.ID})
		}
	}
	return comparisons, edges
}

// connectedComponents groups the node IDs that appear in edges into the
// connected components of the undirected graph those edges describe.
//
// Each returned slice lists the members of one component. Nodes are appended
// in the order the edges attach them; when an edge joins two existing
// components, the members of a's component come first, followed by b's.
//
// The components are returned in ascending order of their internal creation
// index, so the same edge list always produces the same result (ranging over
// the map directly would yield a random order). A self-loop edge {x, x} on an
// unseen node creates a single-member component rather than listing x twice.
// An empty or nil edge list yields an empty, non-nil slice.
func connectedComponents(edges [][2]int) [][]int {
	// components maps a component index to the IDs it contains.
	components := map[int][]int{}
	// idx maps a node ID to the index of the component that holds it.
	idx := map[int]int{}
	// nextIdx is the next unused component index; indexes are never reused.
	nextIdx := 0

	for _, edge := range edges {
		a, b := edge[0], edge[1]
		aIdx, aOk := idx[a]
		bIdx, bOk := idx[b]
		switch {
		case aOk && bOk && aIdx == bIdx: // already in the same component
			continue
		case aOk && bOk: // merge two components under a fresh index
			merged := append(components[aIdx], components[bIdx]...)
			delete(components, aIdx)
			delete(components, bIdx)
			for _, x := range merged {
				idx[x] = nextIdx
			}
			components[nextIdx] = merged
			nextIdx++
		case aOk: // add b to component of a
			idx[b] = aIdx
			components[aIdx] = append(components[aIdx], b)
		case bOk: // add a to component of b
			idx[a] = bIdx
			components[bIdx] = append(components[bIdx], a)
		default: // create new component with a and b
			members := []int{a}
			if b != a {
				// Only list b separately when the edge is not a self-loop.
				members = append(members, b)
			}
			idx[a] = nextIdx
			idx[b] = nextIdx
			components[nextIdx] = members
			nextIdx++
		}
	}

	// Walk the index range instead of the map so the output order is stable;
	// indexes removed by a merge are simply skipped.
	cc := make([][]int, 0, len(components))
	for k := 0; k < nextIdx; k++ {
		if members, ok := components[k]; ok {
			cc = append(cc, members)
		}
	}
	return cc
}

// main loads 100 generated records, groups them with block, compares the
// records inside each group, and derives entities from the matching pairs via
// connectedComponents. It prints the comparison, match and entity totals,
// followed by one line per entity listing its names and city.
func main() {
	records := loadRecords(100)

	var (
		comparisons int
		edges       [][2]int
	)
	for _, group := range block(records) {
		n, found := compare(group)
		comparisons += n
		edges = append(edges, found...)
	}
	entities := connectedComponents(edges)

	fmt.Printf("made %d comparisons and found %d matches and %d entities\n", comparisons, len(edges), len(entities))
	for _, entity := range entities {
		names := make([]string, 0, len(entity))
		for _, id := range entity {
			// Each ID is used directly as an index into records.
			names = append(names, records[id].Name)
		}
		// The city is taken from the entity's first member.
		city := records[entity[0]].City
		fmt.Printf("found the following entity: %s from %s\n", strings.Join(names, ", "), city)
	}
}
// block partitions records by their City field.
//
// The returned map is keyed by city; each value holds the records for that
// city in the order they appear in records.
func block(records []Record) map[string][]Record {
	byCity := make(map[string][]Record)
	for i := range records {
		city := records[i].City
		byCity[city] = append(byCity[city], records[i])
	}
	return byCity
}

func main() {
	records := loadRecords(100)
	blocks := block(records)
	comparisons := 0
	matchCount := 0
	for _, blockRecords := range blocks {
		c, m := compare(blockRecords)
		comparisons += c
		matchCount += m
	}

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}
Daisy Williams and Dave Williams are probably the same person
Deborax Browx and Debra Brown are probably the same person
Riley Brown and RxxeyxBrown are probably the same person
Dan Willxams and Dave Williams are probably the same person
made 9900 comparisons and found 16 matches
// Name parts that randomName combines into a full name.
var (
	firstNames = [...]string{"Wade", "Dave", "Seth", "Ivan", "Riley", "Gilbert", "Jorge", "Dan", "Brian", "Roberto", "Daisy", "Deborah", "Isabel", "Stella", "Debra", "Berverly", "Vera", "Angela", "Lucy", "Lauren"}
	lastNames  = [...]string{"Smith", "Jones", "Williams", "Brown", "Taylor"}
)

// randomName returns "<first> <last>" built from a random entry of firstNames
// and lastNames, with between 0 and 3 randomly chosen byte positions
// overwritten by 'x' to simulate typos. The same position may be picked more
// than once, and the separating space may be overwritten too.
func randomName() string {
	first := firstNames[rand.Intn(len(firstNames))]
	last := lastNames[rand.Intn(len(lastNames))]
	chars := []byte(fmt.Sprintf("%s %s", first, last))
	for remaining := rand.Intn(4); remaining > 0; remaining-- {
		chars[rand.Intn(len(chars))] = 'x'
	}
	return string(chars)
}

// cities lists the city names randomCity chooses from.
var cities = [...]string{
	"Paris",
	"Berlin",
	"New York",
	"Amsterdam",
	"Shanghai",
	"San Francisco",
	"Sydney",
	"Cape Town",
	"Brasilia",
	"Cairo",
}

// randomCity returns a randomly chosen entry of cities.
func randomCity() string {
	pick := rand.Intn(len(cities))
	return cities[pick]
}

// loadRecords generates n records. The record at position i gets ID i, a name
// from randomName and a city from randomCity.
func loadRecords(n int) []Record {
	records := make([]Record, n)
	for i := range records {
		// Draw the name before the city, one pair per record.
		name := randomName()
		city := randomCity()
		records[i] = Record{ID: i, Name: name, City: city}
	}
	return records
}

// compare runs matches on every ordered pair of distinct records and prints a
// line for each pair on which it returns true.
//
// comparisons is the number of calls made to matches and matchCount the
// number of those calls that returned true. Both (a, b) and (b, a) are
// visited, so each unordered pair is examined twice.
func compare(records []Record) (comparisons, matchCount int) {
	for i := range records {
		a := records[i]
		for j := range records {
			b := records[j]
			if a == b {
				// Records with identical field values are never compared.
				continue
			}
			comparisons++
			if !matches(a, b) {
				continue
			}
			fmt.Printf("%s and %s are probably the same person\n", a.Name, b.Name)
			matchCount++
		}
	}
	return comparisons, matchCount
}

func main() {
	records := loadRecords(100)
	comparisons, matchCount := compare(records)

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}
package main

import (
	"fmt"

	"github.com/hbollon/go-edlib"
)

// Record is a single entry to be matched against other entries.
// All fields are comparable, so two Records can be compared with ==.
type Record struct {
	ID   int    // numeric identifier of the record
	Name string // person name, compared fuzzily by matches
	City string // city name, compared exactly by matches
}

// matches reports whether a and b probably describe the same person: the two
// records must have the same City, and the Levenshtein distance between their
// Names must be at most 3.
func matches(a, b Record) bool {
	// maxNameDistance is the largest edit distance between two names that is
	// still treated as a match.
	const maxNameDistance = 3

	// Comparing cities is a cheap string equality check, so do it first and
	// skip the edit-distance computation for records that cannot match.
	if a.City != b.City {
		return false
	}
	return edlib.LevenshteinDistance(a.Name, b.Name) <= maxNameDistance
}

// main builds two sample records with similar names and the same city, and
// prints whether matches considers them to be the same person.
func main() {
	a := Record{Name: "Vincent Van Gogh", City: "Paris"}
	b := Record{Name: "Vince Van Gough", City: "Paris"}

	if !matches(a, b) {
		fmt.Printf("%s and %s are probably not the same person\n", a.Name, b.Name)
		return
	}
	fmt.Printf("%s and %s are probably the same person\n", a.Name, b.Name)
}
star

Fri Aug 11 2023 09:19:43 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching #connectedcomponents
star

Fri Aug 11 2023 09:16:23 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching #connectedcomponents
star

Fri Aug 11 2023 09:13:16 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching
star

Fri Aug 11 2023 09:10:13 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching
star

Fri Aug 11 2023 09:07:34 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching
star

Fri Aug 11 2023 09:00:29 GMT+0000 (Coordinated Universal Time)

#entityresolution #fuzzymatching

Save snippets that work with our extensions

Available in the Chrome Web Store · Get Firefox Add-on · Get VS Code extension