func compare(records []Record) (comparisons int, edges [][2]int) {
for _, a := range records {
for _, b := range records {
if a == b {
continue // don't compare with itself
}
comparisons++
if matches(a, b) {
edges = append(edges, [2]int{a.ID, b.ID})
}
}
}
return comparisons, edges
}
func connectedComponents(edges [][2]int) [][]int {
components := map[int][]int{}
nextIdx := 0
idx := map[int]int{}
for _, edge := range edges {
a := edge[0]
b := edge[1]
aIdx, aOk := idx[a]
bIdx, bOk := idx[b]
switch {
case aOk && bOk && aIdx == bIdx: // in same component
continue
case aOk && bOk && aIdx != bIdx: // merge two components
components[nextIdx] = append(components[aIdx], components[bIdx]...)
delete(components, aIdx)
delete(components, bIdx)
for _, x := range components[nextIdx] {
idx[x] = nextIdx
}
nextIdx++
case aOk && !bOk: // add b to component of a
idx[b] = aIdx
components[aIdx] = append(components[aIdx], b)
case bOk && !aOk: // add a to component of b
idx[a] = bIdx
components[bIdx] = append(components[bIdx], a)
default: // create new component with a and b
idx[a] = nextIdx
idx[b] = nextIdx
components[nextIdx] = []int{a, b}
nextIdx++
}
}
cc := make([][]int, len(components))
i := 0
for k := range components {
cc[i] = components[k]
i++
}
return cc
}
func main() {
records := loadRecords(100)
blocks := block(records)
comparisons := 0
edges := [][2]int{}
for _, blockRecords := range blocks {
c, e := compare(blockRecords)
comparisons += c
edges = append(edges, e...)
}
cc := connectedComponents(edges)
fmt.Printf("made %d comparisons and found %d matches and %d entities\n", comparisons, len(edges), len(cc))
for _, component := range cc {
names := make([]string, len(component))
for i, id := range component {
names[i] = records[id].Name
}
fmt.Printf("found the following entity: %s from %s\n", strings.Join(names, ", "), records[component[0]].City)
}
}