Naive Entity Resolution

PHOTO EMBED

Fri Aug 11 2023 09:07:34 GMT+0000 (Coordinated Universal Time)

Saved by @Tilores #entityresolution #fuzzymatching

// Name pools that randomName combines into "<first> <last>" strings.
var (
	firstNames = [...]string{"Wade", "Dave", "Seth", "Ivan", "Riley", "Gilbert", "Jorge", "Dan", "Brian", "Roberto", "Daisy", "Deborah", "Isabel", "Stella", "Debra", "Berverly", "Vera", "Angela", "Lucy", "Lauren"}
	lastNames  = [...]string{"Smith", "Jones", "Williams", "Brown", "Taylor"}
)

// randomName builds a "<first> <last>" name from pseudo-randomly chosen
// entries of firstNames and lastNames and then injects simulated typos.
//
// Between 0 and 3 byte positions (chosen with rand.Intn) are overwritten
// with 'x'. Positions are drawn independently, so they may repeat or land
// on the separating space; the number of visibly altered characters can
// therefore be smaller than the number of draws.
func randomName() string {
	first := firstNames[rand.Intn(len(firstNames))]
	last := lastNames[rand.Intn(len(lastNames))]
	buf := []byte(fmt.Sprintf("%s %s", first, last))

	// Count down the number of typos still to be applied.
	for typos := rand.Intn(4); typos > 0; typos-- {
		buf[rand.Intn(len(buf))] = 'x'
	}
	return string(buf)
}

// cities is the pool of city names randomCity picks from.
var cities = [...]string{"Paris", "Berlin", "New York", "Amsterdam", "Shanghai", "San Francisco", "Sydney", "Cape Town", "Brasilia", "Cairo"}

// randomCity returns one entry of cities, selected with rand.Intn.
func randomCity() string {
	pick := rand.Intn(len(cities))
	return cities[pick]
}

// loadRecords generates n synthetic records.
//
// Each record's ID is its index in the returned slice (0..n-1); Name and
// City come from randomName and randomCity, called in that order for every
// record.
func loadRecords(n int) []Record {
	records := make([]Record, n)
	for i := range records {
		rec := &records[i]
		rec.ID = i
		rec.Name = randomName()
		rec.City = randomCity()
	}
	return records
}

// compare runs matches on every ordered pair of distinct positions in
// records.
//
// It returns the number of matches calls made (comparisons) and how many of
// those calls reported a match (matchCount). Each reported match is also
// printed to stdout. Both (a, b) and (b, a) are visited, so every unordered
// pair is compared twice and, if matches answers the same in both
// directions, printed and counted twice.
func compare(records []Record) (comparisons, matchCount int) {
	for i, a := range records {
		for j, b := range records {
			// Skip self-comparison by position, not by value: two distinct
			// records whose fields are all equal are exact duplicates and
			// must still be compared. This also avoids requiring Record to
			// be a comparable type.
			if i == j {
				continue
			}
			comparisons++
			if matches(a, b) {
				fmt.Printf("%s and %s are probably the same person\n", a.Name, b.Name)
				matchCount++
			}
		}
	}
	return comparisons, matchCount
}

func main() {
	records := loadRecords(100)
	comparisons, matchCount := compare(records)

	fmt.Printf("made %d comparisons and found %d matches\n", comparisons, matchCount)
}
content_copyCOPY