Match crawled listings to existing facilities (fuzzy) before creating new ones
CI/CD / CI · dotnet build (push) Successful in 1m28s
CI/CD / Deploy · hamkadr (push) Successful in 2m24s

When publishing a scraped listing we now look for a facility we already
have that is exactly or closely the same, and only create a new one when
there is no match — avoiding duplicates like «بیمارستان میلاد» vs «میلاد».

- ListingParser: extract a facility name (keyword + distinctive words) from
  the post and surface it in the parser notes.
- FacilityMatcher: Persian-aware normalization (ي/ك, ZWNJ, punctuation),
  type-word stripping for a "core" name, contains + Levenshtein similarity,
  and FindBest (same-city exact → any-city exact → same-city fuzzy → any-city fuzzy).
- Review (manual publish): auto-select a matching facility or prefill the
  new-facility name; resolve-or-create uses fuzzy match; dropdown preselects.
- IngestionService (auto-publish): reuse FacilityMatcher against a run-wide
  facility list (grows as new ones are created) instead of exact-name only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-08 07:14:48 +03:30
parent a2fc70ae57
commit e6a796ab27
5 changed files with 191 additions and 10 deletions
@@ -51,10 +51,10 @@
<div class="filter-group">
<label>مرکز درمانی</label>
<select name="FacilityId">
<option value="0">— انتخاب نشده —</option>
<option value="0" selected="@(Model.FacilityId == 0)">— انتخاب نشده —</option>
@foreach (var f in Model.Facilities)
{
<option value="@f.Id">@f.Name — @f.City?.Name</option>
<option value="@f.Id" selected="@(Model.FacilityId == f.Id)">@f.Name — @f.City?.Name</option>
}
</select>
<input type="text" name="NewFacilityName" placeholder="یا نام مرکز جدید را وارد کن…" style="margin-top:6px;" />
@@ -1,6 +1,7 @@
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using JobsMedical.Web.Services;
using JobsMedical.Web.Services.Scraping;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.RazorPages;
@@ -72,6 +73,25 @@ public class ReviewModel : PageModel
if (Parsed.PayAmount is not null) { PayAmount = Parsed.PayAmount; SalaryMin = Parsed.PayAmount; }
Description = Raw.RawText;
Title = Parsed.RoleName is not null ? $"استخدام {Parsed.RoleName}" : "موقعیت استخدامی";
// Facility: try to match the listing's facility to one we already have; otherwise
// prefill the "new facility" box so publishing creates it.
if (!string.IsNullOrWhiteSpace(Parsed.FacilityName))
{
var cityId = await _db.Cities.Where(c => c.Name == Parsed.CityName)
.Select(c => (int?)c.Id).FirstOrDefaultAsync();
var match = FacilityMatcher.FindBest(Facilities, Parsed.FacilityName, cityId);
if (match is not null)
{
FacilityId = match.Id;
Parsed.Notes.Add($"مرکز منطبق در سیستم: «{match.Name}» — همین انتخاب شد.");
}
else
{
NewFacilityName = Parsed.FacilityName;
Parsed.Notes.Add($"مرکز جدید پیشنهادی: «{Parsed.FacilityName}» — هنگام انتشار ساخته می‌شود.");
}
}
return Page();
}
@@ -181,15 +201,17 @@ public class ReviewModel : PageModel
if (string.IsNullOrWhiteSpace(NewFacilityName))
return null;
// Reuse a same-named facility if one already exists, else create it.
var name = NewFacilityName.Trim();
var existing = await _db.Facilities.FirstOrDefaultAsync(f => f.Name == name);
if (existing is not null) return existing.Id;
var cityId = await _db.Cities.OrderByDescending(c => c.IsActive)
.Select(c => (int?)c.Id).FirstOrDefaultAsync();
if (cityId is null) return null; // no cities seeded — cannot create a facility
// Reuse an existing facility that's exactly or closely the same (Persian-aware fuzzy
// match), so we don't create duplicates like «بیمارستان میلاد» vs «میلاد».
var all = await _db.Facilities.ToListAsync();
var match = FacilityMatcher.FindBest(all, name, cityId);
if (match is not null) return match.Id;
var facility = new Facility
{
Name = name,