Match crawled listings to existing facilities (fuzzy) before creating new
When publishing a scraped listing we now look for an existing facility that is exactly or closely the same, and only create a new one when there is no match — avoiding duplicates like «بیمارستان میلاد» vs «میلاد».
- ListingParser: extract a facility name (keyword + distinctive words) from the post and surface it in the parser notes.
- FacilityMatcher: Persian-aware normalization (ي/ك, ZWNJ, punctuation), type-word stripping for a "core" name, contains + Levenshtein similarity, and FindBest (same-city exact → any-city exact → same-city fuzzy → any-city fuzzy).
- Review (manual publish): auto-select a matching facility or prefill the new-facility name; resolve-or-create uses fuzzy match; dropdown preselects.
- IngestionService (auto-publish): reuse FacilityMatcher against a run-wide facility list (grows as new ones are created) instead of exact-name only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,109 @@
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Persian-aware fuzzy matching for facility names, so the same hospital written slightly
/// differently — spacing, ي/ك vs ی/ک, ZWNJ, with or without «بیمارستان» — resolves to one
/// record instead of creating a duplicate. Used by both the manual review/publish flow and
/// the auto-publish ingestion pipeline.
/// </summary>
public static class FacilityMatcher
{
    // Collapses any run of whitespace. Declared before TypeWordPatterns on purpose: static
    // fields initialize in textual order and building the patterns calls Normalize.
    private static readonly Regex Whitespace = new Regex(@"\s+", RegexOptions.Compiled);

    // Generic type words stripped to compare the distinctive core of a name.
    private static readonly string[] TypeWords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "مطب", "تخصصی", "فوق تخصصی", "فوقتخصصی", "عمومی", "دکتر", "دی کلینیک",
    };

    // One whole-word pattern per normalized type word, built once instead of on every Core call.
    // Longest first, so a multi-word phrase («فوق تخصصی», «دی کلینیک») is removed before the
    // shorter word it contains («تخصصی», «کلینیک») can split it and leave a stray fragment.
    private static readonly Regex[] TypeWordPatterns = TypeWords
        .Select(w => Normalize(w))
        .Where(w => w.Length > 0)
        .Distinct()
        .OrderByDescending(w => w.Length)
        .Select(w => new Regex(
            $@"(?<![\p{{L}}\p{{N}}]){Regex.Escape(w)}(?![\p{{L}}\p{{N}}])",
            RegexOptions.Compiled))
        .ToArray();

    /// <summary>Lower-cased, Arabic→Persian folded, punctuation-stripped, whitespace-collapsed.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>The normalized name, or an empty string when <paramref name="s"/> is null or blank.</returns>
    public static string Normalize(string? s)
    {
        if (string.IsNullOrWhiteSpace(s)) return "";

        // Fold Arabic letter variants onto their Persian forms. The zero-width non-joiner is
        // written as an escape because the literal character is invisible and easily lost.
        var t = s.Replace('ي', 'ی').Replace('ك', 'ک').Replace('ۀ', 'ه').Replace('ة', 'ه')
            .Replace('أ', 'ا').Replace('إ', 'ا').Replace('آ', 'ا').Replace('ئ', 'ی')
            .Replace('\u200C', ' ').ToLowerInvariant();

        // Anything that is not a letter, digit or space becomes a space.
        var sb = new StringBuilder(t.Length);
        foreach (var ch in t)
            sb.Append(char.IsLetterOrDigit(ch) || ch == ' ' ? ch : ' ');

        return Whitespace.Replace(sb.ToString(), " ").Trim();
    }

    /// <summary>Normalized name with generic type words removed — the distinctive part.</summary>
    /// <param name="s">Raw name; may be null or blank.</param>
    /// <returns>
    /// The core name; empty when the input is blank or consists only of type words.
    /// </returns>
    public static string Core(string? s)
    {
        var n = Normalize(s);
        if (n.Length == 0) return "";

        foreach (var pattern in TypeWordPatterns)
            n = pattern.Replace(n, " ");

        return Whitespace.Replace(n, " ").Trim();
    }

    /// <summary>True when two names almost certainly denote the same facility.</summary>
    /// <param name="a">First name.</param>
    /// <param name="b">Second name.</param>
    /// <returns>
    /// <c>false</c> when either name normalizes to empty; otherwise <c>true</c> when the
    /// normalized names are equal, the cores are equal, one core contains the other, or the
    /// edit-distance similarity is at least 0.86.
    /// </returns>
    public static bool IsSame(string? a, string? b)
    {
        var na = Normalize(a);
        var nb = Normalize(b);
        if (na.Length == 0 || nb.Length == 0) return false;
        if (na == nb) return true;

        var ca = Core(a);
        var cb = Core(b);
        if (ca.Length >= 2 && ca == cb) return true;

        // one core fully contains the other (e.g. «میلاد» vs «میلاد ۱»)
        if (ca.Length >= 3 && cb.Length >= 3 && (ca.Contains(cb) || cb.Contains(ca))) return true;

        // edit-distance similarity on the most informative basis: the cores when both are
        // long enough to be meaningful, otherwise the full normalized names
        var (x, y) = ca.Length >= 3 && cb.Length >= 3 ? (ca, cb) : (na, nb);
        return Similarity(x, y) >= 0.86;
    }

    /// <summary>
    /// Best existing facility for <paramref name="name"/>: same-city exact match first, then
    /// any-city exact, then same-city fuzzy, then any-city fuzzy. Null when nothing matches.
    /// </summary>
    /// <param name="facilities">Candidate facilities; enumerated once and searched in order.</param>
    /// <param name="name">Name to look up; a null or blank name never matches.</param>
    /// <param name="cityId">City to prefer; when null the same-city passes match nothing.</param>
    /// <returns>The first facility satisfying the highest-priority pass, or null.</returns>
    public static Facility? FindBest(IEnumerable<Facility> facilities, string? name, int? cityId)
    {
        if (string.IsNullOrWhiteSpace(name)) return null;

        // Materialize once: the sequence is scanned up to four times below.
        var list = facilities as IList<Facility> ?? facilities.ToList();
        var target = Normalize(name);

        return list.FirstOrDefault(f => cityId.HasValue && f.CityId == cityId && Normalize(f.Name) == target)
            ?? list.FirstOrDefault(f => Normalize(f.Name) == target)
            ?? list.FirstOrDefault(f => cityId.HasValue && f.CityId == cityId && IsSame(f.Name, name))
            ?? list.FirstOrDefault(f => IsSame(f.Name, name));
    }

    // Similarity in [0, 1]: 1 minus the edit distance divided by the longer length.
    private static double Similarity(string a, string b)
    {
        if (a == b) return 1;
        var max = Math.Max(a.Length, b.Length);
        return max == 0 ? 1 : 1.0 - (double)Levenshtein(a, b) / max;
    }

    // Levenshtein edit distance using a single rolling row of length b.Length + 1.
    private static int Levenshtein(string a, string b)
    {
        var dp = new int[b.Length + 1];
        for (var j = 0; j <= b.Length; j++) dp[j] = j;

        for (var i = 1; i <= a.Length; i++)
        {
            var prev = dp[0]; // value of the diagonal cell from the previous row
            dp[0] = i;
            for (var j = 1; j <= b.Length; j++)
            {
                var tmp = dp[j];
                dp[j] = Math.Min(Math.Min(dp[j] + 1, dp[j - 1] + 1), prev + (a[i - 1] == b[j - 1] ? 0 : 1));
                prev = tmp;
            }
        }

        return dp[b.Length];
    }
}
|
||||
@@ -52,6 +52,7 @@ public class IngestionService
|
||||
var roles = await _db.Roles.ToListAsync(ct);
|
||||
var cities = await _db.Cities.ToListAsync(ct);
|
||||
var districts = await _db.Districts.ToListAsync(ct);
|
||||
var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
|
||||
var roleNames = roles.Select(r => r.Name).ToList();
|
||||
var cityNames = cities.Select(c => c.Name).ToList();
|
||||
var districtNames = districts.Select(d => d.Name).ToList();
|
||||
@@ -95,7 +96,7 @@ public class IngestionService
|
||||
|
||||
if (status == RawListingStatus.Normalized)
|
||||
{
|
||||
try { Publish(parsed, ai, raw, roles, cities, districts); published++; }
|
||||
try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
|
||||
}
|
||||
else if (status == RawListingStatus.New) queued++;
|
||||
@@ -157,7 +158,7 @@ public class IngestionService
|
||||
}
|
||||
|
||||
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
|
||||
List<Role> roles, List<City> cities, List<District> districts)
|
||||
List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
|
||||
{
|
||||
var d = ai?.Data;
|
||||
var roleName = d?.Role ?? parsed.RoleName;
|
||||
@@ -170,9 +171,10 @@ public class IngestionService
|
||||
var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);
|
||||
|
||||
var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
|
||||
: !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
|
||||
: $"مرکز درمانی (از {raw.SourceChannel})";
|
||||
var facility = _db.Facilities.Local.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id)
|
||||
?? _db.Facilities.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id);
|
||||
// Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
|
||||
var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
|
||||
if (facility is null)
|
||||
{
|
||||
facility = new Facility
|
||||
@@ -181,6 +183,7 @@ public class IngestionService
|
||||
Phone = parsed.Phone, IsVerified = false,
|
||||
};
|
||||
_db.Facilities.Add(facility);
|
||||
facilities.Add(facility); // so later listings in this run match it too
|
||||
}
|
||||
|
||||
var kind = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();
|
||||
|
||||
Reference in New Issue
Block a user