Files
hamkadr/src/JobsMedical.Web/Services/ListingParser.cs
T
soroush.asadi e6a796ab27
CI/CD / CI · dotnet build (push) Successful in 1m28s
CI/CD / Deploy · hamkadr (push) Successful in 2m24s
Match crawled listings to existing facilities (fuzzy) before creating new
When publishing a scraped listing we now look for a facility we already
have that is exactly or closely the same, and only create a new one when
there is no match — avoiding duplicates like «بیمارستان میلاد» vs «میلاد».

- ListingParser: extract a facility name (keyword + distinctive words) from
  the post and surface it in the parser notes.
- FacilityMatcher: Persian-aware normalization (ي/ك, ZWNJ, punctuation),
  type-word stripping for a "core" name, contains + Levenshtein similarity,
  and FindBest (same-city exact → any-city exact → same-city fuzzy → fuzzy).
- Review (manual publish): auto-select a matching facility or prefill the
  new-facility name; resolve-or-create uses fuzzy match; dropdown preselects.
- IngestionService (auto-publish): reuse FacilityMatcher against a run-wide
  facility list (grows as new ones are created) instead of exact-name only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 07:14:48 +03:30

201 lines
11 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services;
/// <summary>
/// Structured guess extracted from a raw channel post. Every field is best-effort and may be
/// left at its default when nothing was detected.
/// </summary>
public class ParsedListing
{
    /// <summary>Kind of listing; starts out as <c>ListingKind.Shift</c>.</summary>
    public ListingKind Kind { get; set; } = ListingKind.Shift;

    /// <summary>Detected role name, if any.</summary>
    public string? RoleName { get; set; }

    /// <summary>Detected shift type, if any.</summary>
    public ShiftType? ShiftType { get; set; }

    /// <summary>Detected employment type, if any.</summary>
    public EmploymentType? EmploymentType { get; set; }

    /// <summary>Shift pay or a single salary figure.</summary>
    public long? PayAmount { get; set; }

    /// <summary>Profit-share percentage (percentage-based pay / share of income).</summary>
    public int? SharePercent { get; set; }

    /// <summary>Whether the pay is marked as negotiable.</summary>
    public bool PayNegotiable { get; set; }

    /// <summary>Required gender; starts out as <c>Gender.Any</c>.</summary>
    public Gender Gender { get; set; } = Gender.Any;

    /// <summary>Detected city name, if any.</summary>
    public string? CityName { get; set; }

    /// <summary>Detected district name, if any.</summary>
    public string? DistrictName { get; set; }

    /// <summary>Hospital/clinic name guessed from the text.</summary>
    public string? FacilityName { get; set; }

    /// <summary>Detected phone number, if any.</summary>
    public string? Phone { get; set; }

    /// <summary>What was and was not detected; shown to the admin.</summary>
    public List<string> Notes { get; set; } = new List<string>();
}
/// <summary>
/// Contract for turning a messy Persian channel/Divar post into a structured listing guess.
/// The Stage-1 implementation uses transparent keyword + regex heuristics with no AI dependency
/// (important since LLM APIs are blocked from Iran). A future LlmListingParser can implement
/// this same interface and be swapped in via DI without touching the admin queue.
/// </summary>
public interface IListingParser
{
/// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
/// <param name="rawText">The raw text of the post.</param>
/// <param name="knownRoles">Role names the parser may look for in the text.</param>
/// <param name="knownCities">City names the parser may look for in the text.</param>
/// <param name="knownDistricts">District names the parser may look for in the text.</param>
/// <returns>The structured guess for the post.</returns>
ParsedListing Parse(string rawText, IEnumerable<string> knownRoles,
IEnumerable<string> knownCities, IEnumerable<string> knownDistricts);
}
/// <summary>
/// Keyword + regex implementation of <see cref="IListingParser"/>. Every detection is a
/// best-effort heuristic over the normalized post text; nothing here throws on odd input.
/// </summary>
public class HeuristicListingParser : IListingParser
{
    // Characters written as escapes on purpose: several of them are invisible or look identical
    // to a neighbouring code point, which makes literal forms easy to corrupt in an editor.
    private const char ArabicYeh = '\u064A';           // ي
    private const char PersianYeh = '\u06CC';          // ی
    private const char ArabicKaf = '\u0643';           // ك
    private const char PersianKeheh = '\u06A9';        // ک
    private const char ZeroWidthNonJoiner = '\u200C';  // ZWNJ (invisible)
    private const char PersianZero = '\u06F0';         // ۰
    private const char PersianNine = '\u06F9';         // ۹
    private const char ArabicIndicZero = '\u0660';     // ٠
    private const char ArabicIndicNine = '\u0669';     // ٩

    /// <summary>Maximum number of words collected after a facility keyword.</summary>
    private const int MaxNameWords = 3;

    // Iranian mobile number, optionally prefixed with 0 / 98 / +98 / 0098, and not embedded in a
    // longer run of digits (card numbers, national IDs, amounts).
    private static readonly Regex MobileRegex =
        new(@"(?<!\d)(?:(?:\+|00)?98|0)?(9\d{9})(?!\d)", RegexOptions.Compiled);

    // Digits-only token that is really a mobile number written without the leading zero.
    private static readonly Regex BareMobileDigits =
        new(@"^(?:98)?9\d{9}$", RegexOptions.Compiled);

    // "2 میلیون", "2.5 میلیون", "2,5 میلیون", and the Persian decimal forms "2/5" and "2٫5".
    private static readonly Regex MillionRegex =
        new(@"(\d+(?:[.,/٫]\d+)?)\s*میلیون", RegexOptions.Compiled);

    // Runs of digits and thousands separators long enough to be a money figure.
    private static readonly Regex PlainNumberRegex =
        new(@"[\d٬,،.]{6,}", RegexOptions.Compiled);

    // Synonyms common on Divar/Medjobs → canonical seeded role names. Order matters: the first
    // entry with a matching needle wins.
    private static readonly (string[] Needles, string Role)[] RoleSynonyms =
    {
        (new[] { "اتاق عمل", "اسکراب" }, "تکنسین اتاق عمل"),
        (new[] { "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس" }, "تکنسین فوریت‌های پزشکی"),
        (new[] { "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر" }, "کارشناس آزمایشگاه"),
        (new[] { "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "سالمند", "همراه بیمار", "تزریقات", "پانسمان" }, "پرستار"),
        (new[] { "ماما", "مامایی" }, "ماما"),
        (new[] { "فوق تخصص", "متخصص" }, "پزشک متخصص"),
        (new[] { "پزشک", "دکتر", "طبیب" }, "پزشک عمومی"),
    };

    // Words that introduce a facility name, most-specific first (a compound such as
    // «مرکز درمانی» must be tried before the bare «مرکز»).
    private static readonly string[] FacilityKeywords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
    };

    // Words that clearly aren't part of a facility's name — stop collecting here.
    private static readonly HashSet<string> NameStops = new(StringComparer.Ordinal)
    {
        "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
        "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
        "شب", "صبح", "عصر", "می", "ها", "های", "این", "یک", "محترم",
    };

    // Noise allowed directly between the keyword and the name («بیمارستان: میلاد»).
    private static readonly char[] LeadingNameNoise = { ' ', '\t', ':', '«', '"' };

    // A facility name never continues past one of these.
    private static readonly char[] NameTerminators =
        { '\n', '\r', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '»', '"' };

    // Separators between the words of a name on the same line.
    private static readonly char[] NameWordSeparators = { ' ', '\t', '«' };

    /// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
    /// <param name="raw">Raw post text; <c>null</c> is treated as an empty post.</param>
    /// <param name="knownRoles">Role names to look for; the longest match wins.</param>
    /// <param name="knownCities">City names to look for; the first listed match wins.</param>
    /// <param name="knownDistricts">District names to look for; the longest match wins.</param>
    /// <returns>The structured guess, including admin-facing notes about what was detected.</returns>
    public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
        IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
    {
        var p = new ParsedListing();
        var text = Normalize(raw ?? string.Empty);
        // Digit conversion is needed by several detectors below; do it once.
        var latin = ToLatinDigits(text);

        // --- Kind: shift vs hiring ---
        bool jobSignals = ContainsAny(text, "استخدام", "جذب", "دعوت به همکاری", "تمام وقت", "تمام‌وقت", "قرارداد", "ماهانه", "حقوق ثابت");
        bool shiftSignals = ContainsAny(text, "شیفت", "آنکال", "انکال", "نوبت", "کشیک");
        p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
        p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");

        // --- Role (longest match first so «پزشک متخصص» beats «پزشک»), then synonyms ---
        p.RoleName = FirstMentioned(text, knownRoles, longestFirst: true);
        if (p.RoleName is null)
        {
            foreach (var (needles, role) in RoleSynonyms)
            {
                if (ContainsAny(text, needles)) { p.RoleName = role; break; }
            }
        }
        p.Notes.Add(p.RoleName is null ? "نقش: تشخیص داده نشد" : $"نقش: {p.RoleName}");

        // --- Shift type ---
        if (ContainsAny(text, "آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
        else if (ContainsAny(text, "شب")) p.ShiftType = Models.ShiftType.Night;
        else if (ContainsAny(text, "عصر")) p.ShiftType = Models.ShiftType.Evening;
        else if (ContainsAny(text, "صبح", "روز")) p.ShiftType = Models.ShiftType.Day;

        // --- Employment type ---
        if (ContainsAny(text, "پاره وقت", "پاره‌وقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
        else if (ContainsAny(text, "طرح")) p.EmploymentType = Models.EmploymentType.Plan;
        else if (ContainsAny(text, "قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
        else if (ContainsAny(text, "تمام وقت", "تمام‌وقت")) p.EmploymentType = Models.EmploymentType.FullTime;

        // --- Gender requirement ---
        if (ContainsAny(text, "خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
        else if (ContainsAny(text, "آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
        if (p.Gender != Gender.Any)
        {
            var genderLabel = p.Gender == Gender.Female ? "خانم" : "آقا";
            p.Notes.Add($"جنسیت: {genderLabel}");
        }

        // --- City / district ---
        p.CityName = FirstMentioned(text, knownCities, longestFirst: false);
        p.DistrictName = FirstMentioned(text, knownDistricts, longestFirst: true);

        // --- Profit share (percentage-based pay) ---
        var share = Regex.Match(latin, @"(\d{1,3})\s*(?:٪|%|درصد)");
        if (!share.Success) share = Regex.Match(latin, @"(?:٪|%)\s*(\d{1,3})");
        if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
        {
            p.SharePercent = pct;
            p.Notes.Add($"سهم درآمد: {pct}٪");
        }
        else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
        {
            p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)");
        }

        // --- Fixed pay ---
        if (ContainsAny(text, "توافقی", "توافق"))
        {
            p.PayNegotiable = true;
            p.Notes.Add("حقوق: توافقی");
        }
        else
        {
            var amount = ExtractAmount(latin);
            if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
            else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
        }

        // --- Facility name (facility keyword + the distinctive name words) ---
        p.FacilityName = ExtractFacilityName(text);
        if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");

        // --- Phone ---
        p.Phone = ExtractPhone(latin);

        return p;
    }

    /// <summary>
    /// Returns the first candidate whose normalized form occurs in <paramref name="text"/>.
    /// Null and blank candidates are skipped: an empty needle would otherwise match any text.
    /// </summary>
    private static string? FirstMentioned(string text, IEnumerable<string> candidates, bool longestFirst)
    {
        var usable = candidates.Where(c => c is not null);
        if (longestFirst) usable = usable.OrderByDescending(c => c.Length);
        return usable.FirstOrDefault(c =>
        {
            var needle = Normalize(c);
            return needle.Length > 0 && text.Contains(needle, StringComparison.Ordinal);
        });
    }

    /// <summary>
    /// Finds an Iranian mobile number in text whose digits are already Latin. Plain numbers are
    /// returned as written (with or without the leading zero); international forms
    /// (+98 / 0098 / 98) are returned in the local 09… form. Returns <c>null</c> when none is found.
    /// </summary>
    private static string? ExtractPhone(string latinText)
    {
        var m = MobileRegex.Match(latinText);
        if (!m.Success) return null;
        var national = m.Groups[1].Value;
        // Anything longer than "0" + national carries a country-code prefix.
        return m.Value.Length > national.Length + 1 ? "0" + national : m.Value;
    }

    /// <summary>
    /// Best-effort hospital/clinic name: a facility keyword plus up to three name words taken
    /// from the same line. Keywords are tried in priority order and every whole-word occurrence
    /// is considered, so a bare mention followed by a stop word does not hide a later, named one.
    /// </summary>
    private static string? ExtractFacilityName(string text)
    {
        foreach (var kw in FacilityKeywords)
        {
            for (var idx = text.IndexOf(kw, StringComparison.Ordinal);
                 idx >= 0;
                 idx = text.IndexOf(kw, idx + kw.Length, StringComparison.Ordinal))
            {
                var end = idx + kw.Length;
                // Ignore hits inside a longer word (e.g. «مرکز» inside «متمرکز»).
                var wholeWord = (idx == 0 || !char.IsLetter(text[idx - 1]))
                                && (end == text.Length || !char.IsLetter(text[end]));
                if (!wholeWord) continue;

                var picked = CollectNameWords(text[end..]);
                // A bare keyword (e.g. just «بیمارستان») isn't useful; try the next occurrence.
                if (picked.Count > 0) return kw + " " + string.Join(" ", picked);
            }
        }
        return null;
    }

    /// <summary>
    /// Collects the name words that directly follow a facility keyword, stopping at the end of
    /// the line or clause, at a stop word, at anything containing a digit, or at a stray letter.
    /// </summary>
    private static List<string> CollectNameWords(string afterKeyword)
    {
        var segment = afterKeyword.TrimStart(LeadingNameNoise);
        var cut = segment.IndexOfAny(NameTerminators);
        if (cut >= 0) segment = segment[..cut];

        var picked = new List<string>(MaxNameWords);
        foreach (var w in segment.Split(NameWordSeparators, StringSplitOptions.RemoveEmptyEntries))
        {
            // Stop words, numbers/phones and single stray letters all end the name.
            if (w.Length == 1 || NameStops.Contains(w) || w.Any(char.IsDigit)) break;
            picked.Add(w);
            if (picked.Count >= MaxNameWords) break;
        }
        return picked;
    }

    /// <summary>
    /// Pulls a Toman figure out of free text, handling «میلیون» and Persian digits. Tokens that
    /// look like phone numbers are never treated as money. Returns <c>null</c> when nothing fits.
    /// </summary>
    private static long? ExtractAmount(string text)
    {
        var latin = ToLatinDigits(text);

        // e.g. "2 میلیون" / "2.5 میلیون" / "2/5 میلیون". Parsed as decimal so the scaling is exact.
        var million = MillionRegex.Match(latin);
        if (million.Success)
        {
            var figure = million.Groups[1].Value.Replace(',', '.').Replace('/', '.').Replace('٫', '.');
            if (decimal.TryParse(figure, System.Globalization.NumberStyles.AllowDecimalPoint,
                    System.Globalization.CultureInfo.InvariantCulture, out var millions)
                && millions <= long.MaxValue / 1_000_000)
            {
                return (long)(millions * 1_000_000m);
            }
        }

        // Otherwise the largest plain number that looks like money (>= 6 digits after removing separators).
        long best = 0;
        foreach (Match num in PlainNumberRegex.Matches(latin))
        {
            var digits = new string(num.Value.Where(char.IsDigit).ToArray());
            if (digits.Length < 6) continue;

            // Money never starts with 0 (that is 09…, 021…, 0098…), and an unseparated
            // 9xxxxxxxxx / 989xxxxxxxxx token is a mobile number, not a salary.
            var unseparated = digits.Length == num.Value.Length;
            if (digits[0] == '0' || (unseparated && BareMobileDigits.IsMatch(digits))) continue;

            if (long.TryParse(digits, System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture, out var v) && v > best)
            {
                best = v;
            }
        }
        return best > 0 ? best : null;
    }

    /// <summary>
    /// Unifies Arabic yeh/kaf with their Persian forms, turns zero-width non-joiners into plain
    /// spaces and trims the ends.
    /// </summary>
    // NOTE(review): the original third Replace used an invisible character literal; it is taken
    // here to be U+200C (ZWNJ) — confirm against the raw file bytes.
    private static string Normalize(string s) => s
        .Replace(ArabicYeh, PersianYeh)
        .Replace(ArabicKaf, PersianKeheh)
        .Replace(ZeroWidthNonJoiner, ' ')
        .Trim();

    /// <summary>True when <paramref name="text"/> contains at least one of the needles (ordinal).</summary>
    private static bool ContainsAny(string text, params string[] needles)
        => needles.Any(n => text.Contains(n, StringComparison.Ordinal));

    /// <summary>Replaces Persian (۰-۹) and Arabic-Indic (٠-٩) digits with ASCII digits.</summary>
    private static string ToLatinDigits(string s)
    {
        var chars = s.ToCharArray();
        for (var i = 0; i < chars.Length; i++)
        {
            var c = chars[i];
            if (c is >= PersianZero and <= PersianNine) chars[i] = (char)('0' + (c - PersianZero));
            else if (c is >= ArabicIndicZero and <= ArabicIndicNine) chars[i] = (char)('0' + (c - ArabicIndicZero));
        }
        return new string(chars);
    }
}