Files
hamkadr/src/JobsMedical.Web/Services/Scraping/IngestionService.cs
T
soroush.asadi 8d0a403b36
CI/CD / CI · dotnet build (push) Successful in 1m57s
CI/CD / Deploy · hamkadr (push) Successful in 1m9s
Near-duplicate applicant detection (collapse source reposts)
Exact ContentHash dedup misses the same ad reposted with slightly different text
(e.g. the ~18 repeated «کمک‌یار آقا»). DedupeTalentAsync collapses open aggregated
applicants by two high-precision signals — identical phone, or identical
(role, city, normalized description core with digits/«… پیش» time-phrases
stripped) — keeping the newest of each group. Runs at the end of both RunAsync
and ReprocessAsync; removed count surfaces in the run log.

Improvement 1 of the data-quality/SEO backlog.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-20 17:54:26 +03:30

673 lines
38 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using Microsoft.EntityFrameworkCore;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Counters produced for one source (or one reprocess pass) during ingestion.</summary>
/// <param name="Source">Display name of the source the counters belong to.</param>
/// <param name="Fetched">Items looked at.</param>
/// <param name="Queued">Items left for manual review.</param>
/// <param name="Published">Items published automatically.</param>
/// <param name="Flagged">Items flagged for review.</param>
/// <param name="Spam">Items discarded (spam, AI-rejected or stale).</param>
/// <param name="Duplicates">Items skipped or removed as duplicates.</param>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Published,
    int Flagged,
    int Spam,
    int Duplicates);

/// <summary>Outcome of a whole run: the per-source results plus totals summed across them.</summary>
/// <param name="Sources">One entry per source that contributed to the run.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <see cref="SourceResult.Fetched"/> over all sources.</summary>
    public int TotalFetched => Sources.Select(r => r.Fetched).Sum();

    /// <summary>Sum of <see cref="SourceResult.Queued"/> over all sources.</summary>
    public int TotalQueued => Sources.Select(r => r.Queued).Sum();

    /// <summary>Sum of <see cref="SourceResult.Published"/> over all sources.</summary>
    public int TotalPublished => Sources.Select(r => r.Published).Sum();

    /// <summary>Sum of <see cref="SourceResult.Flagged"/> over all sources.</summary>
    public int TotalFlagged => Sources.Select(r => r.Flagged).Sum();

    /// <summary>Sum of <see cref="SourceResult.Spam"/> over all sources.</summary>
    public int TotalSpam => Sources.Select(r => r.Spam).Sum();

    /// <summary>Sum of <see cref="SourceResult.Duplicates"/> over all sources.</summary>
    public int TotalDuplicates => Sources.Select(r => r.Duplicates).Sum();
}
/// <summary>
/// The scrape engine. For every enabled source: dedupe by content hash → parse → rule-validate →
/// (optional) AI audit → decide. Decision depends on admin settings:
/// • spam → Discarded
/// • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
/// • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
/// </summary>
public class IngestionService
{
/// <summary>Applicant posts older than this (by the source's date, or a Persian "time ago"
/// phrase in the text) are skipped at ingest — availability goes stale fast.</summary>
private const int TalentMaxAgeDays = 7;
private readonly AppDbContext _db;
private readonly IEnumerable<IListingSource> _sources;
private readonly IListingParser _parser;
private readonly ListingValidator _validator;
private readonly IAiAuditor _ai;
private readonly SettingsService _settings;
private readonly ILogger<IngestionService> _log;
/// <summary>Stores the injected collaborators; performs no work of its own.</summary>
public IngestionService(
    AppDbContext db,
    IEnumerable<IListingSource> sources,
    IListingParser parser,
    ListingValidator validator,
    IAiAuditor ai,
    SettingsService settings,
    ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}
/// <summary>Names of all registered listing sources; the list is rebuilt on every access.</summary>
public IReadOnlyList<string> SourceNames
{
    get
    {
        var names = new List<string>();
        foreach (var source in _sources)
            names.Add(source.Name);
        return names;
    }
}
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
/// <summary>
/// Runs one ingestion pass over every registered source. Each fetched item is de-duplicated by
/// content hash, parsed, rule-validated, optionally AI-audited and then given a status by
/// <see cref="Decide"/>; items decided as Normalized are published straight away. Changes are saved
/// once per source. Afterwards near-duplicate applicants are collapsed and, if any source produced
/// items, a run-log row is stored.
/// </summary>
/// <param name="ct">Cancels the run. A cancellation requested by the caller is NOT treated as a
/// source failure — it propagates instead of being logged and skipped.</param>
/// <returns>Per-source counters for every source that returned at least one item.</returns>
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    // Hashes already handled in THIS run. The database lookup below cannot see rows that were
    // Add()-ed but not saved yet, so without this set the same text appearing twice in one fetch
    // batch would be stored (and possibly published) twice.
    var seenHashes = new HashSet<string>(StringComparer.Ordinal);

    var results = new List<SourceResult>();
    foreach (var source in _sources)
    {
        int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
        IReadOnlyList<ScrapedItem> items;
        try
        {
            items = await source.FetchAsync(settings, ct);
        }
        // A failing source (including its own timeouts) must not stop the others — but when the
        // CALLER cancelled, let the exception propagate rather than moving on to the next source.
        catch (Exception ex) when (!ct.IsCancellationRequested)
        {
            _log.LogError(ex, "Source {Source} failed", source.Name);
            continue;
        }
        if (items.Count == 0) continue; // disabled/unconfigured source

        foreach (var item in items)
        {
            fetched++;
            var hash = Hash(item.RawText);

            // Same text seen earlier in this run (possibly still unsaved) → duplicate.
            if (!seenHashes.Add(hash)) { dupes++; continue; }

            var existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
            if (existing is not null)
            {
                // Best-effort geo retry: coords are normally captured only on first ingest, but a
                // re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
                // null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
                // coords and the row has none, so an item still sitting in the queue can be placed on
                // the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
                if (existing.Lat is null && item.Lat is not null) { existing.Lat = item.Lat; existing.Lng = item.Lng; }
                dupes++;
                continue;
            }

            var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
            var val = _validator.Validate(item.RawText, parsed);

            // Drop STALE applicant («آماده به کار») posts — a person's availability goes cold fast.
            // Age = the source's real timestamp, else a Persian "time ago" phrase in the text
            // (Divar embeds «۲ هفته پیش»…). Recorded as Discarded (keeps the dedupe hash + audit
            // trail; no AI spend). Shifts/jobs are NOT aged out — their dates are in the future.
            if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
            {
                _db.RawListings.Add(new RawListing
                {
                    SourceChannel = item.Source, SourceUrl = item.SourceUrl, RawText = item.RawText.Trim(),
                    ContentHash = hash, Confidence = 0, Status = RawListingStatus.Discarded,
                    ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد",
                    Lat = item.Lat, Lng = item.Lng,
                });
                spam++;
                continue;
            }

            // Rule-detected spam never reaches the AI (no spend on obvious junk).
            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam)
                ai = await _ai.AuditAsync(item.RawText, settings, ct);

            var (status, reason, confidence) = Decide(settings, val, ai);
            var raw = new RawListing
            {
                SourceChannel = item.Source,
                SourceUrl = item.SourceUrl,
                RawText = item.RawText.Trim(),
                ContentHash = hash,
                Confidence = confidence,
                ValidationNotes = reason,
                Status = status,
                Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
            };
            _db.RawListings.Add(raw);

            if (status == RawListingStatus.Normalized)
            {
                // A publish failure downgrades the item to the manual queue instead of losing it.
                try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
                catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        await _db.SaveChangesAsync(ct);
        results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
        _log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
            source.Name, fetched, queued, published, flagged, spam, dupes);
    }

    var summary = new IngestionSummary(results);
    var deduped = await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch

    // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
    if (results.Count > 0)
    {
        var detail = string.Join("؛ ", results.Select(r =>
            $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"));
        // Surface the near-duplicate removals too, as the reprocess run log already does.
        if (deduped > 0) detail += $"؛ تکراریِ حذف‌شده: {deduped}";
        _db.IngestionRuns.Add(new IngestionRun
        {
            Fetched = summary.TotalFetched,
            Queued = summary.TotalQueued,
            Published = summary.TotalPublished,
            Flagged = summary.TotalFlagged,
            Spam = summary.TotalSpam,
            Duplicates = summary.TotalDuplicates,
            Detail = detail.Length > 2000 ? detail[..2000] : detail,
        });
        await _db.SaveChangesAsync(ct);
    }
    return summary;
}
/// <summary>
/// Re-run the CURRENT parser/AI/publish pipeline over every already-crawled RawListing, WITHOUT
/// re-fetching from sources. Use this after improving the pipeline to clean up existing aggregated
/// content (de-dupe, fix roles/categories/tags) — unlike <see cref="RunAsync"/> + the purge-cache
/// flow, it keeps every raw text, so nothing is lost to sources only exposing recent posts.
/// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running
/// (one AI call per item) — call it on a background scope, not inside a request.
/// </summary>
/// <param name="talentOnly">SEO-safe default: only «آماده به کار» (which is NoIndex/Disallow) is
/// deleted &amp; rebuilt, so no INDEXED url changes. Shift/Job detail pages are indexed + in the
/// sitemap, so churning their IDs would 404 ranked pages — instead they self-clean via turnover.
/// Pass false only when you accept that SEO hit.</param>
/// <param name="ct">Checked at the top of every item. The deletes below run before the loop, so a
/// cancellation mid-loop leaves the aggregated content only partially rebuilt.</param>
/// <returns>A summary with a single pseudo-source entry whose Duplicates counter holds the number
/// of near-duplicate applicants removed at the end of the pass.</returns>
public async Task<IngestionSummary> ReprocessAsync(bool talentOnly = true, CancellationToken ct = default)
{
var settings = await _settings.GetAsync();
var roles = await _db.Roles.ToListAsync(ct);
var cities = await _db.Cities.ToListAsync(ct);
var districts = await _db.Districts.ToListAsync(ct);
var facilities = await _db.Facilities.ToListAsync(ct); // reused (not deleted) → no facility churn
var roleNames = roles.Select(r => r.Name).ToList();
var cityNames = cities.Select(c => c.Name).ToList();
var districtNames = districts.Select(d => d.Name).ToList();
// Drop previously-published aggregated content; it's regenerated below from the raw text.
// DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull.
await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
if (!talentOnly)
{
await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
}
// Here "fetched" counts the stored items that were actually reprocessed (nothing is fetched).
int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0;
// Oldest first, by primary key.
var raws = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct);
foreach (var raw in raws)
{
ct.ThrowIfCancellationRequested();
var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames);
// SEO-safe scope: in talent-only mode, leave indexed shift/job listings (and their
// RawListing links/status) completely untouched — only applicants are rebuilt.
// NOTE(review): this filter uses the PARSER's kind only, while Publish also treats an item as an
// applicant when the AI's kind says so. An ad the parser does not classify as Talent but the AI
// did would have had its applicant row deleted above and is not rebuilt here — confirm intended.
if (talentOnly && parsed.Kind != ListingKind.Talent) continue;
fetched++;
raw.LinkedTalentId = null; // talent rows were just deleted
if (!talentOnly) raw.LinkedShiftId = null;
var val = _validator.Validate(raw.RawText, parsed);
// Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).
// NOTE(review): unlike RunAsync there is no source timestamp here, and the phrase was presumably
// relative to the moment of scraping rather than to now — confirm old rows are aged as intended.
if (parsed.Kind == ListingKind.Talent
&& HtmlUtil.AgeDaysFromPersianText(raw.RawText) is int age && age > TalentMaxAgeDays)
{
raw.Status = RawListingStatus.Discarded; raw.Confidence = 0;
raw.ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد";
spam++; continue;
}
// Rule-detected spam is never sent to the AI.
AiAuditResult? ai = null;
if (settings.AiEnabled && !val.IsSpam)
ai = await _ai.AuditAsync(raw.RawText, settings, ct);
var (status, reason, confidence) = Decide(settings, val, ai);
// The stored row is updated in place (status, notes, confidence) rather than re-created.
raw.Status = status; raw.ValidationNotes = reason; raw.Confidence = confidence;
if (status == RawListingStatus.Normalized)
{
// A publish failure downgrades the item to the manual queue instead of aborting the pass.
try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
catch (Exception ex) { _log.LogWarning(ex, "Reprocess publish failed; queueing"); raw.Status = RawListingStatus.New; queued++; }
}
else if (status == RawListingStatus.New) queued++;
else if (status == RawListingStatus.Flagged) flagged++;
else spam++;
// Only reached by items that got this far: the `continue` paths above skip this checkpoint.
if (fetched % 50 == 0) await _db.SaveChangesAsync(ct); // incremental progress on long runs
}
await _db.SaveChangesAsync(ct);
var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed
// Unlike RunAsync, a run-log row is written unconditionally, and Duplicates carries the number of
// near-duplicate applicants removed.
_db.IngestionRuns.Add(new IngestionRun
{
Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = deduped,
Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذف‌شده",
});
await _db.SaveChangesAsync(ct);
_log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}",
fetched, published, queued, flagged, spam, deduped);
return new IngestionSummary(new List<SourceResult>
{ new("پردازش مجدد", fetched, queued, published, flagged, spam, deduped) });
}
/// <summary>Volatile fragments removed before two applicant descriptions are compared: runs of
/// digits in ANY script (<c>\d</c> covers Latin, Persian and Arabic-Indic digits) and the Persian
/// relative-time phrases a source stamps onto the text.</summary>
private static readonly Regex TalentVolatileText = new(
    @"\d+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز",
    RegexOptions.Compiled | RegexOptions.CultureInvariant);

/// <summary>
/// Collapse near-duplicate aggregated APPLICANTS left when a source reposts the same ad (different
/// text → different ContentHash, so exact dedup missed them). Two signals: an identical phone
/// (digits only, at least 7 of them), or identical (role, city, first 100 chars of the normalized
/// description with digits and «… پیش» time-phrases removed). Keeps the NEWEST row of each group —
/// newest by CreatedAt, ties broken by the higher Id so the outcome is deterministic — and deletes
/// the rest directly in the database.
/// </summary>
/// <param name="ct">Cancellation token for the query and the delete.</param>
/// <returns>The number of rows deleted (0 when nothing qualified).</returns>
/// <remarks>NOTE(review): the phone signal also collapses DIFFERENT applicants advertised under one
/// shared number (e.g. an agency) — confirm that is acceptable.</remarks>
public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
{
    // Only open, aggregated applicants are considered.
    var rows = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
        .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
        .ToListAsync(ct);

    // Grouping key for one row, or null when there is too little signal to call it a duplicate.
    static string? Sig(string? phone, int roleId, int cityId, string? desc)
    {
        var p = DigitsOnly(phone ?? "");
        if (p.Length >= 7) return "p:" + p; // same number = same person/repost

        // Normalize BEFORE stripping so spelling variants of the time phrases (Arabic yeh/kaf,
        // ZWNJ, doubled spaces) are removed too; normalize again to re-collapse the gaps left behind.
        var core = NormalizeFa(TalentVolatileText.Replace(NormalizeFa(desc), " ")).Trim();
        if (core.Length < 15) return null; // too little to call it a dup safely
        return $"t:{roleId}:{cityId}:{(core.Length > 100 ? core[..100] : core)}";
    }

    var toRemove = rows
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.Phone, r.RoleId, r.CityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        .SelectMany(g => g
            .OrderByDescending(x => x.CreatedAt)
            .ThenByDescending(x => x.Id) // rows created in the same batch can share a timestamp
            .Skip(1)
            .Select(x => x.Id))
        .ToList();
    if (toRemove.Count == 0) return 0;

    var removed = await _db.TalentListings.Where(t => toRemove.Contains(t.Id)).ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
    return removed;
}
/// <summary>Returns only the digit characters of <paramref name="s"/>, after passing it through
/// <c>HtmlUtil.ToLatinDigits</c>. Everything that is not a digit is dropped.</summary>
private static string DigitsOnly(string s)
{
    var digits = new StringBuilder();
    foreach (var ch in HtmlUtil.ToLatinDigits(s))
    {
        if (char.IsDigit(ch)) digits.Append(ch);
    }
    return digits.ToString();
}
/// <summary>
/// Picks the status for one item from the rule validation and the (optional) AI audit.
/// Order of precedence: rule spam → Discarded; with an AI verdict: reject → Discarded, approve →
/// Normalized/New (or Flagged when the rules saw nothing medical), anything else → Flagged;
/// without AI: invalid → Flagged, otherwise Normalized when automatic mode and the rule confidence
/// reaches the configured minimum, else New.
/// </summary>
/// <param name="s">Admin settings (mode, AI auto-approve switch, auto-publish threshold).</param>
/// <param name="val">Result of the rule validator.</param>
/// <param name="ai">AI audit result, or null when the AI was not consulted.</param>
/// <returns>The status, a human-readable note (may be null) and the confidence to store — the AI's
/// when an AI verdict exists, the validator's otherwise.</returns>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    string? notes = null;
    if (val.Issues.Count > 0) notes = string.Join("؛ ", val.Issues);

    if (val.IsSpam)
        return (RawListingStatus.Discarded, Join("اسپم", notes), val.Confidence);

    // ── No AI verdict: the rules decide on their own. ──
    if (ai is null)
    {
        if (!val.IsValid)
            return (RawListingStatus.Flagged, notes, val.Confidence);

        var rulesAutoPublish = s.Mode == IngestionMode.Automatic && val.Confidence >= s.AutoPublishMinConfidence;
        return (rulesAutoPublish ? RawListingStatus.Normalized : RawListingStatus.New, notes, val.Confidence);
    }

    // ── AI verdict present. ──
    var verdict = $"AI: {ai.Decision} ({ai.Confidence}٪)";
    if (ai.Reason is not null) verdict += $" — {ai.Reason}";
    var aiNote = Join(verdict, notes);

    if (ai.Reject)
        return (RawListingStatus.Discarded, aiNote, ai.Confidence);

    if (!ai.Approve)
        return (RawListingStatus.Flagged, aiNote, ai.Confidence); // review

    // MEDICAL GATE: the rule-validator's medical signal vetoes an AI approval. The AI can
    // hallucinate (e.g. approved a GeekVape product ad 95% as a «پرستار» job) — when our
    // own keyword/role check sees nothing clinical, never auto-publish; send to review.
    if (!val.LooksMedical)
        return (RawListingStatus.Flagged, Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote), ai.Confidence);

    var aiAutoPublish = s.Mode == IngestionMode.Automatic && s.AiAutoApprove;
    return (aiAutoPublish ? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
}
/// <summary>
/// Turns one accepted raw ad into public listing rows on the DbContext (nothing is saved here):
/// a single TalentListing for an applicant ad, otherwise one JobOpening or one Shift per resolved
/// role, attached to a matched-or-newly-created unverified facility. For most fields the AI's
/// structured data wins over the rule parser's when both are present. Ends by marking
/// <paramref name="raw"/> as Normalized.
/// </summary>
/// <param name="parsed">Rule-parser output for the ad.</param>
/// <param name="ai">AI audit result whose <c>Data</c> supplies structured fields; may be null.</param>
/// <param name="raw">The stored raw ad (text, source URL, approximate coords).</param>
/// <param name="roles">Known roles; GROWN in place when a new role is created.</param>
/// <param name="cities">Known cities (read only). Must not be empty.</param>
/// <param name="districts">Known districts (read only).</param>
/// <param name="facilities">Known facilities; GROWN in place when a new facility is created.</param>
/// <exception cref="InvalidOperationException">When <paramref name="cities"/> is empty, or no role
/// candidate exists and <paramref name="roles"/> is empty. Callers catch and queue the item.</exception>
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
    List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
{
    var d = ai?.Data;
    var cityName = d?.City ?? parsed.CityName;
    var districtName = d?.District ?? parsed.DistrictName;

    // One ad can name several roles («پرستار سالمند و کودک و همراه بیمار») — resolve them all
    // and publish one listing per role so each is browsable/filterable. Capped to avoid spam.
    // The AI's role (+ its category) is the trusted, possibly-new one; parser names are already
    // canonical matches. Unknown roles are CREATED (dynamic taxonomy), not dropped.
    var candidates = new List<(string name, string? category)>();
    if (!string.IsNullOrWhiteSpace(d?.Role)) candidates.Add((d!.Role!.Trim(), d.Category));
    foreach (var n in parsed.RoleNames) candidates.Add((n, null));
    if (parsed.RoleName is not null) candidates.Add((parsed.RoleName, null));

    var pubRoles = new List<Role>();
    foreach (var (name, category) in candidates)
    {
        if (string.IsNullOrWhiteSpace(name)) continue;
        var role = ResolveOrCreateRole(roles, name, category);
        if (!pubRoles.Contains(role)) pubRoles.Add(role);
        if (pubRoles.Count >= 4) break;
    }
    if (pubRoles.Count == 0) pubRoles.Add(roles.First()); // no candidate at all → first known role

    // City: exact name match, else the first active city, else simply the first one.
    var city = cities.FirstOrDefault(c => c.Name == cityName)
        ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First();
    var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);

    // Approx. coords for the map: the source ad's point (Divar) when present; otherwise, for a
    // Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough
    // center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin.
    double? appLat = raw.Lat, appLng = raw.Lng;
    if (appLat is null && city.Name == "تهران"
        && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote) is { } g)
    { appLat = g.lat; appLng = g.lng; }

    // Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran
    // (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
    if (appLat is null && d?.Lat is double aLat && d?.Lng is double aLng && InTehran(aLat, aLng))
    { appLat = aLat; appLng = aLng; }

    var kindStr = (d?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();

    // «آماده به کار» — a worker offering themselves. No facility involved.
    if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
    {
        // ONE person = ONE listing. Do NOT fan out across roles: an applicant has a single
        // profession, and «پرستار» + «پرستار کودک» from the same ad were producing duplicate
        // cards. Use the primary (AI) role; any secondary role names become searchable tags.
        var role = pubRoles[0];
        var extraRoleTags = pubRoles.Skip(1).Select(r => r.Name);
        var tPay = d?.PayAmount ?? parsed.PayAmount;
        var tShare = d?.SharePercent ?? parsed.SharePercent;
        _db.TalentListings.Add(new TalentListing
        {
            Role = role, City = city, DistrictId = district?.Id,
            PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
            YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
            IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
            AreaNote = parsed.AreaNote,
            Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
            Gender = parsed.Gender,
            PayType = tShare is not null && tPay is null ? PayType.Percentage
                : tPay is null ? PayType.Negotiable : PayType.PerShift,
            PayAmount = tPay, SharePercent = tShare,
            Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
            Description = raw.RawText,
            Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
            Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
            Contacts = BuildContacts(d, parsed),
            Tags = BuildTags(parsed, d, role, city, extraRoleTags),
        });
        raw.Status = RawListingStatus.Normalized;
        return;
    }

    // Never surface the crawl source (e.g. «مدجابز») in a public facility name. An unnamed ad
    // falls back to ONE shared placeholder (same string as the manual-review flow, so both
    // pipelines reuse a single record). That placeholder is shared by every unnamed ad in a
    // city, so it must NEVER receive a single ad's fuzzy coords — that would mis-place dozens of
    // unrelated listings on the map and in «near me». Mirrors Review.ResolveFacilityIdAsync.
    bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
    var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
        : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
        : UnknownFacilityName;

    // Reuse an existing facility (exact or Persian-aware fuzzy match) before creating a new one.
    var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
    if (facility is null)
    {
        var adPhone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone;
        facility = new Facility
        {
            Name = facilityName, Type = FacilityType.Clinic, City = city,
            // For the same reason as the coords, the shared placeholder must not inherit ONE ad's
            // district or phone number: every later unnamed ad in the city would show them too.
            // Each listing still carries its own number(s) in Contacts.
            DistrictId = unnamed ? null : district?.Id,
            Phone = unnamed ? null : adPhone,
            IsVerified = false,
            Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approx. Divar map center
        };
        _db.Facilities.Add(facility);
        facilities.Add(facility); // so later listings in this run match it too
    }
    else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
    {
        // Backfill coords only when the matched (real, named) facility has none — never overwrite a
        // real (employer-set or verified) location with Divar's fuzzy point.
        facility.Lat = raw.Lat; facility.Lng = raw.Lng;
    }

    // NOTE(review): the job/shift branches below read pay from the PARSER only, whereas the
    // applicant branch above prefers the AI's pay fields — confirm whether that is intentional.
    if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
    {
        foreach (var role in pubRoles)
            _db.JobOpenings.Add(new JobOpening
            {
                Facility = facility, Role = role,
                // The AI's title is only used when the ad maps to a single role; a fan-out gets
                // a per-role generated title.
                Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
                EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
                SalaryMin = parsed.PayAmount,
                Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
                SourceUrl = raw.SourceUrl,
                Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
                Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
            });
    }
    else
    {
        var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
        var (start, end) = DefaultTimes(st);
        foreach (var role in pubRoles)
            _db.Shifts.Add(new Shift
            {
                Facility = facility, Role = role,
                Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1), // placeholder: tomorrow (UTC)
                StartTime = start, EndTime = end, ShiftType = st,
                SpecialtyRequired = role.Name, Description = raw.RawText,
                PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
                    : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
                PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
                Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
                Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
                Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
            });
    }
    raw.Status = RawListingStatus.Normalized;
}
/// <summary>Builds the space-separated tag string for a listing. Sources, in order: the parser's
/// tags, the role name, the role category, the city name, any secondary role names, then the
/// AI's tags. Blank entries and noise tags (see <see cref="IsNoiseTag"/>) are dropped, the rest are
/// trimmed and de-duplicated keeping the first occurrence.</summary>
/// <param name="parsed">Parser output supplying the base tags.</param>
/// <param name="d">AI structured data; its tags are optional.</param>
/// <param name="role">The listing's role (name and category become tags).</param>
/// <param name="city">The listing's city (name becomes a tag).</param>
/// <param name="extraRoles">Optional secondary role names to add as tags.</param>
/// <returns>The kept tags joined by single spaces.</returns>
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city,
    IEnumerable<string>? extraRoles = null)
{
    var collected = new List<string>(parsed.Tags);
    collected.Add(role.Name);
    collected.Add(role.Category);
    collected.Add(city.Name);

    if (extraRoles is not null) collected.AddRange(extraRoles);

    if (d?.Tags is not null)
    {
        foreach (var aiTag in d.Tags)
        {
            if (!string.IsNullOrWhiteSpace(aiTag)) collected.Add(aiTag.Trim());
        }
    }

    var kept = new List<string>();
    var seen = new HashSet<string>();
    foreach (var tag in collected)
    {
        if (string.IsNullOrWhiteSpace(tag) || IsNoiseTag(tag)) continue;
        var trimmed = tag.Trim();
        if (seen.Add(trimmed)) kept.Add(trimmed);
    }
    return string.Join(" ", kept);
}
// Words/phrases that are NOT clinical skills — pay, contact, generic verbs, sentence fragments —
// that were polluting the tag chips («پرداخت توافقی»، «مراقبت از»…).
private static readonly string[] TagStopWords =
{
"توافقی", "پرداخت", "پرداخت توافقی", "حقوق", "دستمزد", "تماس", "شماره", "شماره تماس",
"مراقبت از", "مراقبت", "همکاری", "آماده", "آماده به کار", "نیرو", "استخدام", "جذب",
// personality / filler — not clinical skills
"خوش‌اخلاق", "خوش اخلاق", "خوشاخلاق", "دلسوز", "منظم", "مسئولیت‌پذیر", "مسئولیت پذیر", "باتجربه", "مجرب",
};
/// <summary>True when a tag should be hidden from the tag chips: it is shorter than two characters
/// after normalization, ends in a dangling «… از», or equals one of <c>TagStopWords</c>.</summary>
/// <param name="tag">The candidate tag; normalized with <c>NormalizeFa</c> before any check.</param>
private static bool IsNoiseTag(string tag)
{
    var t = NormalizeFa(tag);

    // Ordinal suffix checks: these are fixed tokens, so the result must not depend on the host's
    // culture or globalization mode (the string-only EndsWith overload is culture-sensitive).
    if (t.Length < 2
        || t.EndsWith(" از", StringComparison.Ordinal)
        || t.EndsWith("-از", StringComparison.Ordinal))
        return true; // dangling «… از»

    return TagStopWords.Any(w => NormalizeFa(w) == t);
}
/// <summary>Maps a role name onto an existing <see cref="Role"/>, creating one only when nothing
/// fits. Steps: strip gender/seniority modifiers, then (1) match an existing role by normalized
/// name, (2) translate a known synonym to its canonical role name and match that, (3) otherwise
/// create a new active role — under the canonical name when step 2 found one that is not in the
/// list yet.</summary>
/// <param name="roles">Known roles; a newly created role is appended so later calls reuse it.</param>
/// <param name="name">Role name as produced by the AI or the parser.</param>
/// <param name="category">Suggested category; resolved to the closed category set before use.</param>
/// <returns>The matched or newly created role (added to the DbContext, not yet saved).</returns>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
    // Drop gender/seniority modifiers baked into the role («پرستار آقا»→«پرستار»,
    // «کارآموز تکنسین داروخانه»→«تکنسین داروخانه»). None of the real roles contain these tokens,
    // so it only collapses sprawl — the modifier still lives on as a tag / the Gender field.
    name = StripRoleModifiers(name);
    var wanted = NormalizeFa(name);

    Role? FindByNormalizedName(string key)
    {
        foreach (var candidate in roles)
        {
            if (NormalizeFa(candidate.Name) == key) return candidate;
        }
        return null;
    }

    // (1) Already a known role (same word or spelling variant).
    var known = FindByNormalizedName(wanted);
    if (known is not null) return known;

    // (2) A synonym of a canonical role → use that role; don't create a duplicate.
    if (RoleAliases.TryGetValue(wanted, out var canonical))
    {
        var viaAlias = FindByNormalizedName(NormalizeFa(canonical));
        if (viaAlias is not null) return viaAlias;
        name = canonical; // canonical not seeded yet → create under its proper name
    }

    // (3) Genuinely new role — create it under a canonical-resolved category.
    var highestSort = roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder);
    var created = new Role
    {
        Name = Clamp(name.Trim(), 100),                 // respect Role.Name MaxLength(100)
        Category = Clamp(ResolveCategory(category), 50), // closed set → respect MaxLength(50)
        IsActive = true,
        SortOrder = highestSort + 1,
    };
    _db.Roles.Add(created);
    roles.Add(created); // reuse within this run (saved with the batch at end of source)
    _log.LogInformation("Ingestion introduced new role «{Role}» (category «{Category}») from AI.",
        created.Name, created.Category);
    return created;
}
/// <summary>Resolves a suggested category to a canonical one via <c>CategoryAliases</c> (looked up
/// by normalized text). Categories are a closed set, so anything not in the map — including null
/// or empty input — becomes «سایر»; a category is never invented here.</summary>
/// <param name="category">Category text as suggested by the AI; may be null.</param>
/// <returns>The canonical category name, or «سایر».</returns>
private static string ResolveCategory(string? category)
{
    var key = NormalizeFa(category);
    if (CategoryAliases.TryGetValue(key, out var canonical)) return canonical;
    return "سایر";
}
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new()
{
["پزشک عمومی"] = new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" },
["پزشک متخصص"] = new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" },
["پرستار"] = new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" },
["پرستار سالمندان"] = new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" },
["ماما"] = new[] { "مامایی", "کارشناس مامایی", "midwife" },
["تکنسین اتاق عمل"] = new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" },
["تکنسین فوریت‌های پزشکی"] = new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" },
["کارشناس آزمایشگاه"] = new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندون پزشک", "dentist" },
["کمک بهیار"] = new[] { "کمک‌یار", "کمکیار", "کمک یار", "کمک‌بهیار", "کمک بیمار" },
});
// Synonyms → canonical CATEGORY (the role-group used for filters/chips).
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new()
{
["پزشک"] = new[] { "دکتر", "طبیب", "doctor", "پزشکی" },
["پرستار"] = new[] { "پرستاری", "nurse", "nursing" },
["ماما"] = new[] { "مامایی", "midwifery" },
["تکنسین"] = new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" },
["دندانپزشک"] = new[] { "دندان پزشک", "دندانپزشکی", "dental" },
});
/// <summary>Inverts a {canonical → synonyms} table into a {normalized text → canonical} lookup.
/// The canonical name's own normalized form is registered too. When two entries normalize to the
/// same key, the one written last wins.</summary>
/// <param name="src">Canonical names with their synonym lists.</param>
/// <returns>A new dictionary keyed by <c>NormalizeFa</c>-normalized text.</returns>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();
    foreach (var entry in src)
    {
        var canonical = entry.Key;
        lookup[NormalizeFa(canonical)] = canonical;
        foreach (var synonym in entry.Value)
        {
            lookup[NormalizeFa(synonym)] = canonical;
        }
    }
    return lookup;
}
/// <summary>Normalizes Persian text for comparison: Arabic yeh (U+064A) → Persian yeh (U+06CC),
/// Arabic kaf (U+0643) → Persian kaf (U+06A9), ZWNJ (U+200C) → a plain space (so «کمک‌یار» and
/// «کمک یار» compare equal), then trim, collapse every whitespace run to one space and lowercase
/// (so Latin tags like "ICU"/"icu" also match). Null is treated as empty.</summary>
/// <remarks>The characters are written as escapes on purpose: the literals are invisible or
/// look-alike glyphs that are easily lost or swapped when the file is edited or copied.</remarks>
/// <param name="s">Text to normalize; may be null.</param>
/// <returns>The normalized text, never null.</returns>
private static string NormalizeFa(string? s) => Regex.Replace(
    (s ?? "")
        .Replace('\u064A', '\u06CC') // Arabic yeh → Persian yeh
        .Replace('\u0643', '\u06A9') // Arabic kaf → Persian kaf
        .Replace('\u200C', ' ')      // zero-width non-joiner → space
        .Trim(),
    @"\s+", " ").ToLowerInvariant();
/// <summary>Limits <paramref name="s"/> to at most <paramref name="max"/> characters. A string that
/// already fits is returned untouched; a longer one is cut and then trimmed, so the result can be
/// shorter than <paramref name="max"/>.</summary>
private static string Clamp(string s, int max)
{
    if (s.Length <= max) return s;
    return s.Substring(0, max).Trim();
}
/// <summary>Bounding-box test used to reject out-of-area (hallucinated) AI coordinates: true when
/// the latitude is within [35.4, 35.95] and the longitude within [51.0, 51.8], bounds included.</summary>
private static bool InTehran(double lat, double lng)
{
    const double South = 35.4, North = 35.95;
    const double West = 51.0, East = 51.8;

    var latitudeOk = lat >= South && lat <= North;
    var longitudeOk = lng >= West && lng <= East;
    return latitudeOk && longitudeOk;
}
// Gender/seniority tokens that don't belong in a role name (they go to tags / the Gender field).
private static readonly string[] RoleModifierWords =
{ "آقا", "خانم", "خانوم", "بانو", "مرد", "زن", "کارآموز", "کارورز", "ارشد", "مبتدی" };
/// <summary>Removes modifier tokens (see <c>RoleModifierWords</c>) from a role name, keeping the
/// base profession. Tokens are compared in normalized form. When something is stripped the result
/// is the remaining normalized tokens joined by spaces. When NOTHING is stripped — or stripping
/// would leave nothing — the caller's own spelling is returned (trimmed, whitespace runs
/// collapsed), so a role created from it keeps its Latin casing and ZWNJ («پرستار ICU» is not
/// turned into «پرستار icu»). Either way the result normalizes to the same text, so lookups by
/// normalized name are unaffected.</summary>
/// <param name="name">Role name to clean.</param>
/// <returns>The cleaned role name; never empty unless the input was blank.</returns>
private static string StripRoleModifiers(string name)
{
    var tokens = NormalizeFa(name).Split(' ', StringSplitOptions.RemoveEmptyEntries);
    var kept = tokens
        .Where(t => !RoleModifierWords.Any(m => NormalizeFa(m) == t))
        .ToList();

    // No modifier present, or only modifiers: hand back the original spelling.
    if (kept.Count == 0 || kept.Count == tokens.Length)
        return Regex.Replace(name.Trim(), @"\s+", " ");

    return string.Join(" ", kept);
}
/// <summary>Builds a fresh list of ContactMethod rows for ONE listing: a copy of every parser
/// contact, numbered 0,1,2… in order. When the AI supplied a phone and the parser found no
/// Mobile/Phone contact at all, the AI's number is put first as a Mobile contact with sort
/// order -1.</summary>
/// <param name="d">AI structured data (only its phone is used); may be null.</param>
/// <param name="parsed">Parser output supplying the contacts.</param>
/// <returns>A new list; the parser's contact objects themselves are not reused.</returns>
private static List<ContactMethod> BuildContacts(AiStructured? d, ParsedListing parsed)
{
    var contacts = new List<ContactMethod>();
    var order = 0;
    foreach (var c in parsed.Contacts)
    {
        contacts.Add(new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = order });
        order++;
    }

    var aiPhone = d?.Phone;
    if (string.IsNullOrWhiteSpace(aiPhone)) return contacts;

    var alreadyHasNumber = contacts.Any(c => c.Type is ContactType.Mobile or ContactType.Phone);
    if (alreadyHasNumber) return contacts;

    contacts.Insert(0, new ContactMethod { Type = ContactType.Mobile, Value = aiPhone!.Trim(), SortOrder = -1 });
    return contacts;
}
/// <summary>Chooses the shift type: the AI's label when it is one of day/evening/night/oncall
/// (case-insensitive, exact text), otherwise the parser's value, otherwise Day.</summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "day": return ShiftType.Day;
        case "evening": return ShiftType.Evening;
        case "night": return ShiftType.Night;
        case "oncall": return ShiftType.OnCall;
        default: return parsed ?? ShiftType.Day;
    }
}
/// <summary>Chooses the employment type: the AI's label when it is one of
/// parttime/contract/plan/fulltime (case-insensitive, exact text), otherwise the parser's value,
/// otherwise FullTime.</summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "parttime": return EmploymentType.PartTime;
        case "contract": return EmploymentType.Contract;
        case "plan": return EmploymentType.Plan;
        case "fulltime": return EmploymentType.FullTime;
        default: return parsed ?? EmploymentType.FullTime;
    }
}
/// <summary>Default (start, end) clock times for a shift type: Day 08:00–14:00, Evening
/// 14:00–20:00, Night 20:00–08:00, anything else 08:00–08:00.</summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t)
{
    var morning = new TimeOnly(8, 0);
    var afternoon = new TimeOnly(14, 0);
    var evening = new TimeOnly(20, 0);

    if (t == ShiftType.Day) return (morning, afternoon);
    if (t == ShiftType.Evening) return (afternoon, evening);
    if (t == ShiftType.Night) return (evening, morning);
    return (morning, morning);
}
/// <summary>Joins two note fragments as "a | b"; returns <paramref name="a"/> alone when
/// <paramref name="b"/> is null or empty.</summary>
private static string? Join(string a, string? b)
{
    if (string.IsNullOrEmpty(b)) return a;
    return a + " | " + b;
}
/// <summary>Content hash used for exact de-duplication: the text is trimmed, every whitespace run
/// is collapsed to a single space, and the UTF-8 bytes are hashed with SHA-256. Null is treated
/// as empty.</summary>
/// <returns>64 lowercase hexadecimal characters.</returns>
private static string Hash(string text)
{
    var collapsed = Regex.Replace((text ?? string.Empty).Trim(), @"\s+", " ");
    var digest = SHA256.HashData(Encoding.UTF8.GetBytes(collapsed));

    var hex = new StringBuilder(digest.Length * 2);
    foreach (var b in digest)
    {
        hex.Append(b.ToString("x2"));
    }
    return hex.ToString();
}
/// <summary>Age of a post in whole days. Uses the item's own timestamp when it has one (never
/// negative: a timestamp in the future counts as 0), otherwise whatever
/// <c>HtmlUtil.AgeDaysFromPersianText</c> derives from the text. Null means the age is unknown —
/// callers then do not filter the item out.</summary>
/// <remarks>NOTE(review): the timestamp is subtracted from <c>DateTime.UtcNow</c> as-is, so it is
/// presumably expected to be in UTC — confirm against the sources.</remarks>
private static int? PostAgeDays(ScrapedItem item)
{
    if (item.PostedAt is not DateTime posted)
        return HtmlUtil.AgeDaysFromPersianText(item.RawText);

    var wholeDays = (int)Math.Floor((DateTime.UtcNow - posted).TotalDays);
    return wholeDays < 0 ? 0 : wholeDays;
}
}