Files
hamkadr/src/JobsMedical.Web/Services/ListingParser.cs
T
soroush.asadi 88eca92333
CI/CD / CI · dotnet build (push) Successful in 1m51s
CI/CD / Deploy · hamkadr (push) Successful in 2m17s
Facility data hygiene: merge duplicates, drop junk-named facilities
Cleans up the crawl-generated facility table that surfaced garbage on /Facilities
(«بیمارستان هستم», «... از مدجابز», bare «کلینیک», «سازمان برنامه جنوبی» x3):

- FacilityMatcher.IsJunkName: shared detector for non-names — bare type words, cores
  made only of filler/verb tokens, and leaked crawl-source/placeholder text. Added
  داروخانه/آسایشگاه to the generic type words so bare ones are caught and dedupe better.
- HeuristicListingParser.ExtractFacilityName now rejects junk candidates (and emoji), so
  new ingests fall back to the shared placeholder instead of forging a fake facility.
- IngestionService.MergeAndCleanFacilitiesAsync (+ admin button): folds junk facilities
  into the placeholder and merges Persian-fuzzy duplicates into one keeper, repointing
  their shifts/jobs first. Hard guard: only purely crawl-generated, unmanaged facilities
  are removed — employer-owned and verified facilities are never touched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 05:40:29 +03:30

428 lines
23 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services;
/// <summary>
/// One contact channel pulled from a post: the channel type plus the value as extracted.
/// </summary>
/// <param name="Type">Which kind of channel this is (mobile, landline, e-mail, social handle, …).</param>
/// <param name="Value">The extracted value for that channel (a number, address, handle or URL).</param>
public record ParsedContact(ContactType Type, string Value);
/// <summary>
/// Structured guess extracted from a raw channel post. All fields are best-effort: anything the
/// parser could not detect stays at its default (null, false, an empty list, or the initializer
/// shown on the property).
/// </summary>
public class ParsedListing
{
public ListingKind Kind { get; set; } = ListingKind.Shift;
public string? RoleName { get; set; } // primary role (first match)
public List<string> RoleNames { get; set; } = new(); // all roles named in the ad (e.g. «سالمند» + «کودک»: elderly care + child care)
public ShiftType? ShiftType { get; set; }
public EmploymentType? EmploymentType { get; set; }
public long? PayAmount { get; set; } // shift pay or single salary figure
public int? SharePercent { get; set; } // profit-share percentage (ads say «درصدی» / «سهم درآمد»)
public bool PayNegotiable { get; set; }
public Gender Gender { get; set; } = Gender.Any; // required gender; stays Any when the ad states none
public string? CityName { get; set; }
public string? DistrictName { get; set; }
public string? FacilityName { get; set; } // hospital/clinic name guessed from the text
public string? Phone { get; set; }
// Talent («آماده به کار», "available for work") extras — populated when Kind == Talent.
public string? PersonName { get; set; } // e.g. «دکتر سپیده علیزاده» (title + name)
public int? YearsExperience { get; set; } // years of experience
public bool IsLicensed { get; set; } // the person states they hold a practice licence
public string? AreaNote { get; set; } // area restriction, e.g. «فقط منطقه ۱» ("district 1 only")
public List<ParsedContact> Contacts { get; set; } = new(); // phones, email, socials…
public List<string> Tags { get; set; } = new(); // cert/skill keywords for search
public List<string> Notes { get; set; } = new(); // what was/wasn't detected (shown to admin)
}
/// <summary>
/// Turns a messy Persian channel/Divar post into a structured listing guess. The Stage-1
/// implementation is transparent keyword + regex heuristics with no AI dependency (important
/// since LLM APIs are blocked from Iran). A future LlmListingParser can implement the same
/// interface and be swapped in via DI without touching the admin queue.
/// </summary>
public interface IListingParser
{
/// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
/// <param name="rawText">The post text as received from the channel.</param>
/// <param name="knownRoles">Role names the parser may report when it finds them in the text.</param>
/// <param name="knownCities">City names the parser may report when it finds them in the text.</param>
/// <param name="knownDistricts">District names the parser may report when it finds them in the text.</param>
/// <returns>The structured guess; fields that could not be detected keep their defaults.</returns>
ParsedListing Parse(string rawText, IEnumerable<string> knownRoles,
IEnumerable<string> knownCities, IEnumerable<string> knownDistricts);
}
public class HeuristicListingParser : IListingParser
{
/// <summary>
/// Parses a raw post into a best-effort <see cref="ParsedListing"/> using keyword and regex
/// heuristics. Every detection step also appends a human-readable line to
/// <see cref="ParsedListing.Notes"/> so an admin can see what was (or was not) recognized.
/// </summary>
/// <param name="raw">The post text as received; <c>null</c> is treated as an empty post.</param>
/// <param name="knownRoles">Taxonomy role names; only these can be reported as roles.</param>
/// <param name="knownCities">City names; the first one contained in the text is reported.</param>
/// <param name="knownDistricts">District names; the longest one contained in the text is reported.</param>
/// <returns>The structured guess. Undetected fields keep their defaults.</returns>
public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
    IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
{
    var p = new ParsedListing();

    // Guard null once, up front: normalizing a null post used to throw before the
    // null-tolerant contact extraction at the end of the method could ever run.
    var source = raw ?? string.Empty;
    var text = Normalize(source);

    // --- Kind: talent (worker offers themselves) vs shift vs hiring ---
    // Talent is checked first: «آماده به کار/همکاری» and «جویای کار» mean the *person* is
    // available — distinct from an employer's «دعوت به همکاری».
    bool talentSignals = ContainsAny(text,
        "آماده به کار", "آماده‌به‌کار", "آماده همکاری", "آماده‌ی همکاری", "آماده ی همکاری",
        "آماده فعالیت", "جویای کار", "جویای کار هستم", "متقاضی کار", "نیازمند کار",
        "آماده انجام", "می‌توانم همکاری", "میتوانم همکاری", "حاضر به همکاری");
    bool jobSignals = ContainsAny(text, "استخدام", "جذب", "دعوت به همکاری", "نیازمندیم", "نیازمند است", "حقوق ثابت");
    bool shiftSignals = ContainsAny(text, "شیفت", "آنکال", "انکال", "نوبت", "کشیک");
    if (talentSignals)
    {
        p.Kind = ListingKind.Talent;
        p.Notes.Add("نوع: آماده به کار (تشخیص خودکار)");
    }
    else
    {
        // Hiring only when there are hiring cues and no shift cues; everything else is a shift.
        p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
        p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");
    }

    // --- Roles (an ad can name several at once: «پرستار سالمند و کودک و همراه بیمار») ---
    var known = knownRoles.ToList();
    var hits = new List<string>();
    // Exact taxonomy matches (longest first so «پزشک متخصص» beats «پزشک»).
    foreach (var role in known.OrderByDescending(r => r.Length))
        if (text.Contains(Normalize(role))) hits.Add(role);
    // Drop a role that's a substring of a longer matched role (پرستار ⊂ پرستار سالمندان).
    hits = hits.Where(r => !hits.Any(o => o != r && o.Contains(r))).Distinct().ToList();

    // Synonyms → canonical role names (covers terms not written verbatim). Only add a canonical
    // that actually exists in the taxonomy, and isn't already a hit.
    void AddSyn(string canonical, params string[] needles)
    {
        if (ContainsAny(text, needles) && known.Contains(canonical) && !hits.Contains(canonical))
            hits.Add(canonical);
    }
    AddSyn("پرستار سالمندان", "سالمند", "سالمندان", "نگهداری سالمند");
    AddSyn("دندانپزشک", "دندان", "دندانپزشک", "دندان‌پزشک");
    AddSyn("تکنسین اتاق عمل", "اتاق عمل", "اسکراب");
    AddSyn("تکنسین فوریت‌های پزشکی", "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس");
    AddSyn("کارشناس آزمایشگاه", "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر");
    AddSyn("ماما", "مامایی");
    AddSyn("پرستار", "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "همراه بیمار",
        "کودک", "اطفال", "نوزاد", "تزریقات", "پانسمان");
    AddSyn("پزشک متخصص", "فوق تخصص", "متخصص");
    AddSyn("پزشک عمومی", "پزشک", "دکتر", "طبیب");
    p.RoleNames = hits.Distinct().Take(4).ToList(); // cap fan-out
    p.RoleName = p.RoleNames.FirstOrDefault();
    p.Notes.Add(p.RoleNames.Count == 0 ? "نقش: تشخیص داده نشد" : $"نقش‌ها: {string.Join("، ", p.RoleNames)}");

    // --- Shift type (first matching cue wins: on-call, night, evening, day) ---
    if (ContainsAny(text, "آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
    else if (text.Contains("شب")) p.ShiftType = Models.ShiftType.Night;
    else if (text.Contains("عصر")) p.ShiftType = Models.ShiftType.Evening;
    else if (ContainsAny(text, "صبح", "روز")) p.ShiftType = Models.ShiftType.Day;

    // --- Employment type (first matching cue wins) ---
    if (ContainsAny(text, "پاره وقت", "پاره‌وقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
    else if (text.Contains("طرح")) p.EmploymentType = Models.EmploymentType.Plan;
    else if (text.Contains("قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
    else if (ContainsAny(text, "تمام وقت", "تمام‌وقت")) p.EmploymentType = Models.EmploymentType.FullTime;

    // --- Gender requirement (female cues are checked before male cues) ---
    if (ContainsAny(text, "خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
    else if (ContainsAny(text, "آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
    if (p.Gender != Gender.Any)
        p.Notes.Add($"جنسیت: {(p.Gender == Gender.Female ? "خانم" : "آقا")}");

    // --- City / district ---
    p.CityName = knownCities.FirstOrDefault(c => text.Contains(Normalize(c)));
    p.DistrictName = knownDistricts.OrderByDescending(d => d.Length)
        .FirstOrDefault(d => text.Contains(Normalize(d)));

    // --- Profit share: a number before a percent sign/word, or a percent sign before a number ---
    var latinForShare = ToLatinDigits(text);
    var share = Regex.Match(latinForShare, @"(\d{1,3})\s*(?:٪|%|درصد)");
    if (!share.Success) share = Regex.Match(latinForShare, @"(?:٪|%)\s*(\d{1,3})");
    if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
    { p.SharePercent = pct; p.Notes.Add($"سهم درآمد: {pct}٪"); }
    else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
    { p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }

    // --- Fixed pay (strip phone numbers first so they're never read as money) ---
    if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
    else
    {
        var amount = ExtractAmount(StripPhones(text));
        if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
        else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
    }

    // --- Talent extras (only meaningful for «آماده به کار») ---
    if (p.Kind == ListingKind.Talent)
    {
        var latinT = ToLatinDigits(text);
        // Years of experience: «سابقه … N سال» or «N سال سابقه».
        var exp = Regex.Match(latinT, @"سابقه[^\d]{0,8}(\d{1,2})\s*سال");
        if (!exp.Success) exp = Regex.Match(latinT, @"(\d{1,2})\s*سال\s*سابقه");
        if (exp.Success && int.TryParse(exp.Groups[1].Value, out var yrs) && yrs is > 0 and <= 60)
        { p.YearsExperience = yrs; p.Notes.Add($"سابقه: {yrs} سال"); }

        p.IsLicensed = ContainsAny(text, "پروانه دار", "پروانه‌دار", "دارای پروانه", "پروانه فعالیت", "پروانه طبابت");
        if (p.IsLicensed) p.Notes.Add("پروانه‌دار");

        p.PersonName = ExtractPersonName(text);
        if (p.PersonName is not null) p.Notes.Add($"نام: {p.PersonName}");

        // Area restriction such as «منطقه ۱» (a district number, Persian or Latin digits).
        var area = Regex.Match(text, @"منطقه\s*[۰-۹0-9]{1,2}");
        if (area.Success) { p.AreaNote = area.Value.Trim(); p.Notes.Add($"محدوده: {p.AreaNote}"); }
    }

    // --- Facility name (بیمارستان/درمانگاه/کلینیک ... + the distinctive name) ---
    // Skipped for talent posts: there the text describes a person, not a workplace.
    if (p.Kind != ListingKind.Talent)
    {
        p.FacilityName = ExtractFacilityName(text);
        if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
    }

    // --- Tags (certs/skills for deep search): mmt, icu, پروانه‌دار, اتاق عمل … ---
    p.Tags = ExtractTags(text);
    if (p.RoleNames.Count > 0) p.Tags.AddRange(p.RoleNames);
    if (p.IsLicensed && !p.Tags.Contains("پروانه‌دار")) p.Tags.Add("پروانه‌دار");
    p.Tags = p.Tags.Distinct().ToList();

    // --- Contacts (phones, email, socials — one ad may have several) ---
    // Extracted from the un-normalized post text rather than the normalized copy.
    p.Contacts = ExtractContacts(source);
    p.Phone = p.Contacts.FirstOrDefault(c => c.Type is ContactType.Mobile or ContactType.Phone)?.Value;
    if (p.Contacts.Count > 0)
        p.Notes.Add("راه‌های ارتباطی: " + string.Join("، ", p.Contacts.Select(c => ContactLabel(c.Type))));
    return p;
}
// Words that introduce a facility name, longest/most-specific first
// (e.g. «مرکز درمانی» is listed before the bare «مرکز»).
private static readonly string[] FacilityKeywords =
{
    "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
    "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
    "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
};

// Words that clearly aren't part of a name — name collection stops at the first of these.
private static readonly string[] NameStops =
{
    "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
    "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
    "شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
};

/// <summary>
/// Best-effort hospital/clinic name: a facility keyword plus up to three following name words.
/// Keywords are tried in <see cref="FacilityKeywords"/> order; only the first occurrence of each
/// keyword in the text is examined.
/// </summary>
/// <param name="text">The normalized post text.</param>
/// <returns>The guessed facility name, or <c>null</c> when no usable candidate was found.</returns>
private static string? ExtractFacilityName(string text)
{
    var separators = new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' };

    foreach (var keyword in FacilityKeywords)
    {
        var start = text.IndexOf(keyword, StringComparison.Ordinal);
        if (start < 0) continue;

        // Collect the words right after the keyword until something that cannot be part of a
        // name shows up: a stop word, a token with digits (numbers/phones), a token without any
        // letter (emoji / punctuation such as «📍»), or a stray single character.
        var nameWords = text[(start + keyword.Length)..]
            .Split(separators, StringSplitOptions.RemoveEmptyEntries)
            .TakeWhile(w => !NameStops.Contains(w)
                            && !Regex.IsMatch(w, @"\d")
                            && w.Any(char.IsLetter)
                            && w.Length != 1)
            .Take(3)
            .ToList();

        // A bare keyword (e.g. just «بیمارستان») isn't useful; try the next keyword.
        if (nameWords.Count == 0) continue;

        var candidate = (keyword + " " + string.Join(" ", nameWords)).Trim();

        // Reject names that are only filler/verb/source noise («بیمارستان هستم», «... از مدجابز»):
        // no real name could be extracted, so the caller ends up with no facility name for it.
        if (!Scraping.FacilityMatcher.IsJunkName(candidate)) return candidate;
    }

    return null;
}
// Titles that introduce a person's name in «آماده به کار» posts, tried in this order.
// NOTE(review): «دکتر» is tried before «خانم دکتر» / «آقای دکتر», so those longer titles can
// never win — any text containing them also contains «دکتر». Left as-is because it yields the
// «دکتر <name>» form; confirm whether the longer forms were meant to take precedence.
private static readonly string[] PersonTitles = { "دکتر", "خانم دکتر", "آقای دکتر", "مهندس", "سرکار خانم", "جناب آقای", "خانم", "آقای" };

/// <summary>
/// Best-effort person name: a title (دکتر/خانم/…) plus up to two following words.
/// Only the first occurrence of each title in the text is examined.
/// </summary>
/// <param name="text">The normalized post text.</param>
/// <returns>The title followed by the collected words, or <c>null</c> when nothing usable follows any title.</returns>
private static string? ExtractPersonName(string text)
{
    foreach (var title in PersonTitles)
    {
        var idx = text.IndexOf(title, StringComparison.Ordinal);
        if (idx < 0) continue;

        var after = text[(idx + title.Length)..];
        var words = after.Split(
            new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/' },
            StringSplitOptions.RemoveEmptyEntries);

        var picked = new List<string>();
        foreach (var w in words)
        {
            if (NameStops.Contains(w)) break;     // stop words end the name
            if (Regex.IsMatch(w, @"[\d]")) break; // numbers/phones aren't names
            // Emoji / symbols («📞») aren't names. An emoji is two UTF-16 units, so the
            // single-character check below does not catch it; the facility-name extractor
            // applies the same guard.
            if (!w.Any(char.IsLetter)) break;
            if (w.Length == 1) break;             // stray letters
            picked.Add(w);
            if (picked.Count >= 2) break;
        }

        if (picked.Count == 0) continue; // a bare title isn't a name; try the next title
        return (title + " " + string.Join(" ", picked)).Trim();
    }

    return null;
}
/// <summary>
/// Blanks out phone numbers so they are never mistaken for money: first the remainder of any
/// line that starts a «شماره تماس/موبایل/…» label, then (after converting digits to Latin)
/// mobile and landline number shapes.
/// </summary>
/// <param name="text">The post text.</param>
/// <returns>The text with Latin digits and each phone match replaced by a single space.</returns>
private static string StripPhones(string text)
{
    const string labelledLine = @"شماره\s*(?:تماس|موبایل|همراه|ثابت|تلفن)[^\n]*";
    const string mobile = @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)";
    const string landline = @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)";

    var withoutLabels = Regex.Replace(text, labelledLine, " ");
    var latin = ToLatinDigits(withoutLabels);
    var withoutMobiles = Regex.Replace(latin, mobile, " ");
    return Regex.Replace(withoutMobiles, landline, " ");
}
/// <summary>
/// Pulls a monetary figure out of free text and normalizes it to toman (rial → toman = ÷10),
/// handling «میلیون» ("million") and Persian/Arabic digits.
/// </summary>
/// <param name="text">The post text, ideally with phone numbers already stripped.</param>
/// <returns>The amount in toman, or <c>null</c> when no plausible figure was found.</returns>
private static long? ExtractAmount(string text)
{
    var latin = ToLatinDigits(text);
    bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
    bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;

    // e.g. "۲ میلیون" / "2.5 میلیون [ریال]"
    var million = Regex.Match(latin, @"(\d+(?:[.,]\d+)?)\s*میلیون\s*(ریال|ريال)?");
    if (million.Success && double.TryParse(million.Groups[1].Value.Replace(",", "."),
            System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var m))
    {
        // Round rather than truncate: 8.2 * 1_000_000 evaluates to 8199999.999999999 in
        // binary floating point, which a plain cast would turn into 8,199,999.
        var val = (long)Math.Round(m * 1_000_000);
        if (million.Groups[2].Success) val /= 10; // «میلیون ریال»
        return val;
    }

    // Largest plain number that looks like money (6–10 digits, no leading zero — a leading
    // zero or 11+ digits means it's a phone/id). Convert rial → toman by the unit next to the
    // number, else by the ad's overall currency.
    long best = 0;
    bool bestWasRial = false; // whether the current best has already been divided by 10
    foreach (Match num in Regex.Matches(latin, @"(?<!\d)([1-9][\d٬,،.]{4,})\s*(ریال|ريال|تومان|تومن)?"))
    {
        var digits = Regex.Replace(num.Groups[1].Value, @"[^\d]", "");
        if (digits.Length is < 6 or > 10 || !long.TryParse(digits, out var v)) continue;

        var unit = num.Groups[2].Value;
        bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial);
        if (isRial) v /= 10;
        if (v > best)
        {
            best = v;
            bestWasRial = isRial;
        }
    }

    // Sanity: a monthly figure of 200M+ toman is implausible in Iran — if the ad never said
    // «تومان», it was almost certainly rial, so normalize. Skip this when the figure was
    // already converted from rial above, otherwise it would be divided by 10 twice.
    if (best >= 200_000_000 && !hasToman && !bestWasRial) best /= 10;
    return best > 0 ? best : null;
}
// Matches an e-mail address anywhere in the text.
private static readonly Regex EmailRx =
    new Regex(@"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", RegexOptions.Compiled);

// Matches an http(s) URL, running up to the next whitespace character.
private static readonly Regex UrlRx =
    new Regex(@"https?://[^\s]+", RegexOptions.Compiled);

// Display label for a contact type; delegates to ContactInfo.Label.
private static string ContactLabel(ContactType t)
{
    return ContactInfo.Label(t);
}
/// <summary>
/// Pulls every contact channel out of a post: e-mail addresses, socials (Instagram / Telegram /
/// Bale / WhatsApp / website) via URLs and Persian keyword cues, then mobile and landline
/// numbers. Duplicates (same type and value, ignoring case) are dropped and the result is
/// capped at eight entries in discovery order.
/// </summary>
/// <param name="raw">The post text; Persian/Arabic digits are converted to Latin before matching.</param>
/// <returns>The contacts found, at most eight.</returns>
private static List<ParsedContact> ExtractContacts(string raw)
{
    var latin = ToLatinDigits(raw);
    var list = new List<ParsedContact>();

    // Adds a contact after trimming surrounding punctuation; ignores values shorter than two
    // characters and (type, value) pairs that are already present.
    void Add(ContactType t, string v)
    {
        v = v.Trim().Trim('.', '،', ',', ')', '(', ':', '«', '»', '"', '/').Trim();
        if (v.Length < 2) return;
        if (!list.Any(c => c.Type == t && string.Equals(c.Value, v, StringComparison.OrdinalIgnoreCase)))
            list.Add(new ParsedContact(t, v));
    }

    foreach (Match m in EmailRx.Matches(latin)) Add(ContactType.Email, m.Value);

    // URLs: known social hosts are stored as their handle, anything else as a website URL.
    foreach (Match m in UrlRx.Matches(latin))
    {
        var u = m.Value.TrimEnd('.', '،', ')', '(', '"');
        var low = u.ToLowerInvariant();
        if (low.Contains("instagram.com") || low.Contains("instagr.am")) Add(ContactType.Instagram, UrlHandle(u));
        else if (low.Contains("t.me") || low.Contains("telegram.me")) Add(ContactType.Telegram, UrlHandle(u));
        else if (low.Contains("ble.ir") || low.Contains("bale.ai")) Add(ContactType.Bale, UrlHandle(u));
        else if (low.Contains("wa.me") || low.Contains("whatsapp")) Add(ContactType.WhatsApp, UrlHandle(u));
        else Add(ContactType.Website, u);
    }

    // Persian keyword → handle (latin handles only, so Persian words after the cue don't match).
    void Keyed(ContactType t, params string[] kws)
    {
        foreach (var kw in kws)
            foreach (Match m in Regex.Matches(latin, kw + @"\s*[:]?\s*@?([A-Za-z0-9_.]{3,30})"))
                Add(t, m.Groups[1].Value);
    }
    Keyed(ContactType.Instagram, "اینستاگرام", "اینستگرام", "اینستا", "پیج");
    Keyed(ContactType.Telegram, "تلگرام");
    Keyed(ContactType.WhatsApp, "واتساپ", "واتس اپ");

    // Phones — mobiles then landlines (multiple), boundary-guarded.
    // Mobiles are stored in the national 11-digit form (09xxxxxxxxx).
    foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)"))
    {
        var d = Regex.Replace(m.Value, @"\D", "");
        if (d.StartsWith("98")) d = "0" + d[2..];
        if (d.Length == 10 && d[0] == '9') d = "0" + d;
        Add(ContactType.Mobile, d);
    }

    foreach (Match m in Regex.Matches(latin, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)"))
    {
        var d = Regex.Replace(m.Value, @"\D", "");
        // The landline shape also matches an 11-digit mobile (09xxxxxxxxx), with or without a
        // separator after the prefix. Classify by the digits so a mobile is not added a second
        // time as a landline; Add de-duplicates it against the mobile pass above.
        var isMobile = d.Length == 11 && d.StartsWith("09", StringComparison.Ordinal);
        Add(isMobile ? ContactType.Mobile : ContactType.Phone, d);
    }

    return list.Take(8).ToList();
}
// Canonical tag → trigger words found in the post.
private static readonly (string Tag, string[] Needles)[] TagDict =
{
    ("mmt", new[] { "mmt", "ام ام تی", "ام‌ام‌تی" }),
    ("ICU", new[] { "icu", "آی سی یو", "آی‌سی‌یو" }),
    ("CCU", new[] { "ccu", "سی سی یو", "سی‌سی‌یو" }),
    ("NICU", new[] { "nicu", "ان آی سی یو", "نوزادان" }),
    ("BLS", new[] { "bls" }),
    ("ACLS", new[] { "acls" }),
    ("دیالیز", new[] { "دیالیز" }),
    ("اتاق عمل", new[] { "اتاق عمل", "اسکراب" }),
    ("بیهوشی", new[] { "بیهوشی" }),
    ("تریاژ", new[] { "تریاژ" }),
    ("تزریقات", new[] { "تزریقات", "تزریق" }),
    ("پانسمان", new[] { "پانسمان", "زخم" }),
    ("سونوگرافی", new[] { "سونوگرافی" }),
    ("رادیولوژی", new[] { "رادیولوژی" }),
    ("اورژانس", new[] { "اورژانس", "فوریت" }),
    ("مسئول فنی", new[] { "مسئول فنی" }),
    ("طرح", new[] { "طرح" }),
    ("سالمند", new[] { "سالمند" }),
    ("کودک", new[] { "کودک", "اطفال" }),
    ("همراه بیمار", new[] { "همراه بیمار" }),
    ("پروانه‌دار", new[] { "پروانه" }),
};

/// <summary>
/// Returns the canonical tag of every <see cref="TagDict"/> entry that has at least one trigger
/// word in the text, in dictionary order.
/// </summary>
/// <param name="text">The normalized post text.</param>
/// <returns>The matched tags; empty when nothing matched.</returns>
private static List<string> ExtractTags(string text)
{
    // Latin abbreviations are usually written in upper case («ICU», «BLS»), so they are
    // matched case-insensitively and only as whole tokens — otherwise «icu» would also fire
    // inside «NICU» or «curriculum». Persian needles keep the plain substring match.
    static bool Matches(string haystack, string needle)
    {
        if (needle.Any(c => c > '\u007F')) return haystack.Contains(needle);
        return Regex.IsMatch(
            haystack,
            @"(?<![A-Za-z0-9])" + Regex.Escape(needle) + @"(?![A-Za-z0-9])",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
    }

    var tags = new List<string>();
    foreach (var (tag, needles) in TagDict)
        if (needles.Any(n => Matches(text, n))) tags.Add(tag);
    return tags;
}
/// <summary>
/// Reduces a profile URL to its handle: drops everything from the first '?', trims trailing
/// slashes, and returns the last path segment. Falls back to the original URL when that
/// segment is empty or whitespace.
/// </summary>
private static string UrlHandle(string url)
{
    var queryStart = url.IndexOf('?');
    var withoutQuery = queryStart >= 0 ? url[..queryStart] : url;
    var path = withoutQuery.TrimEnd('/');

    var lastSlash = path.LastIndexOf('/');
    var segment = lastSlash >= 0 ? path[(lastSlash + 1)..] : path;

    if (string.IsNullOrWhiteSpace(segment))
    {
        return url;
    }

    return segment;
}
/// <summary>
/// Canonicalizes Persian text for matching: Arabic yeh (ي) and kaf (ك) become their Persian
/// forms (ی, ک), the zero-width non-joiner becomes a regular space, and the result is trimmed.
/// </summary>
/// <remarks>
/// NOTE(review): the third replacement used to be written with a raw invisible character inside
/// the char literal, which cannot be read in an editor and does not survive copy/paste. It is
/// spelled out here as U+200C (zero-width non-joiner) on the assumption that this is the
/// character the literal held — confirm against the raw bytes of the file.
/// </remarks>
private static string Normalize(string s) => s
    .Replace('ي', 'ی')
    .Replace('ك', 'ک')
    .Replace('\u200C', ' ')
    .Trim();
/// <summary>True when <paramref name="text"/> contains at least one of the given needles (ordinal substring match).</summary>
private static bool ContainsAny(string text, params string[] needles)
{
    return Array.Exists(needles, needle => text.Contains(needle));
}
/// <summary>
/// Returns a copy of the string with Persian (۰–۹) and Arabic-Indic (٠–٩) digits replaced by
/// their Latin equivalents; every other character is left unchanged.
/// </summary>
private static string ToLatinDigits(string s)
{
    static char ToLatin(char c) => c switch
    {
        >= '۰' and <= '۹' => (char)('0' + (c - '۰')),
        >= '٠' and <= '٩' => (char)('0' + (c - '٠')),
        _ => c,
    };

    return new string(Array.ConvertAll(s.ToCharArray(), ToLatin));
}
}