2026-06-03 08:18:19 +03:30
|
|
|
|
using System.Security.Cryptography;
|
|
|
|
|
|
using System.Text;
|
|
|
|
|
|
using System.Text.RegularExpressions;
|
|
|
|
|
|
using JobsMedical.Web.Data;
|
|
|
|
|
|
using JobsMedical.Web.Models;
|
|
|
|
|
|
using Microsoft.EntityFrameworkCore;
|
|
|
|
|
|
|
|
|
|
|
|
namespace JobsMedical.Web.Services.Scraping;
|
|
|
|
|
|
|
2026-06-03 17:41:02 +03:30
|
|
|
|
/// <summary>Counters collected for one source during a single ingestion (or reprocess) run.</summary>
/// <param name="Source">Display name of the source the counters belong to.</param>
/// <param name="Fetched">Number of items processed for this source.</param>
/// <param name="Queued">Items stored with status New, i.e. left in the review queue.</param>
/// <param name="Published">Items that were published automatically.</param>
/// <param name="Flagged">Items stored with status Flagged.</param>
/// <param name="Spam">Items discarded (spam, or applicant posts that were too old).</param>
/// <param name="Duplicates">Items recognised as duplicates and not stored again.</param>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Published,
    int Flagged,
    int Spam,
    int Duplicates);
|
2026-06-03 08:18:19 +03:30
|
|
|
|
|
|
|
|
|
|
/// <summary>Outcome of a run: the per-source counters plus totals summed over all sources.</summary>
/// <param name="Sources">One entry per source that produced a result.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <see cref="SourceResult.Fetched"/> over all sources.</summary>
    public int TotalFetched => Total(static s => s.Fetched);

    /// <summary>Sum of <see cref="SourceResult.Queued"/> over all sources.</summary>
    public int TotalQueued => Total(static s => s.Queued);

    /// <summary>Sum of <see cref="SourceResult.Published"/> over all sources.</summary>
    public int TotalPublished => Total(static s => s.Published);

    /// <summary>Sum of <see cref="SourceResult.Flagged"/> over all sources.</summary>
    public int TotalFlagged => Total(static s => s.Flagged);

    /// <summary>Sum of <see cref="SourceResult.Spam"/> over all sources.</summary>
    public int TotalSpam => Total(static s => s.Spam);

    /// <summary>Sum of <see cref="SourceResult.Duplicates"/> over all sources.</summary>
    public int TotalDuplicates => Total(static s => s.Duplicates);

    // Single place for the summation so every total is computed the same way. The totals are
    // recomputed from Sources on every read; nothing is cached.
    private int Total(Func<SourceResult, int> pick) => Sources.Sum(pick);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
2026-06-03 17:41:02 +03:30
|
|
|
|
/// The scrape engine. For every enabled source: dedupe by content hash → parse → rule-validate →
|
|
|
|
|
|
/// (optional) AI audit → decide. Decision depends on admin settings:
|
|
|
|
|
|
/// • spam → Discarded
|
|
|
|
|
|
/// • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
|
|
|
|
|
|
/// • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
|
|
|
|
|
|
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
|
2026-06-03 08:18:19 +03:30
|
|
|
|
/// </summary>
|
|
|
|
|
|
public class IngestionService
|
|
|
|
|
|
{
|
2026-06-10 21:28:12 +03:30
|
|
|
|
/// <summary>Applicant posts older than this (by the source's date, or a Persian "time ago"
|
|
|
|
|
|
/// phrase in the text) are skipped at ingest — availability goes stale fast.</summary>
|
|
|
|
|
|
private const int TalentMaxAgeDays = 7;
|
|
|
|
|
|
|
2026-06-03 08:18:19 +03:30
|
|
|
|
// Database context: reference data, raw listings, published listings and run-log rows.
private readonly AppDbContext _db;

// Every registered listing source; a crawl fetches each of them in turn.
private readonly IEnumerable<IListingSource> _sources;

// Turns raw ad text into a parsed listing, given the known role/city/district names.
private readonly IListingParser _parser;

// Rule-based screening of a parsed listing (its result carries the IsSpam flag).
private readonly ListingValidator _validator;

// Optional AI audit; only called when the admin settings enable it and the item is not spam.
private readonly IAiAuditor _ai;

// Source of the admin settings that are read at the start of each run.
private readonly SettingsService _settings;

private readonly ILogger<IngestionService> _log;
|
|
|
|
|
|
|
2026-06-03 17:41:02 +03:30
|
|
|
|
/// <summary>Creates the service and stores the collaborators it is given; performs no other work.</summary>
public IngestionService(
    AppDbContext db,
    IEnumerable<IListingSource> sources,
    IListingParser parser,
    ListingValidator validator,
    IAiAuditor ai,
    SettingsService settings,
    ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}
|
|
|
|
|
|
|
2026-06-04 00:44:11 +03:30
|
|
|
|
/// <summary>
/// Names of all registered sources, in the enumeration order of the injected collection.
/// A new list is built on every read.
/// </summary>
public IReadOnlyList<string> SourceNames
{
    get
    {
        var names = new List<string>();
        foreach (var source in _sources)
        {
            names.Add(source.Name);
        }

        return names;
    }
}
|
2026-06-03 08:18:19 +03:30
|
|
|
|
|
2026-06-09 21:38:55 +03:30
|
|
|
|
/// <summary>Shared placeholder facility name for unnamed ads — kept identical to
|
|
|
|
|
|
/// Review.ResolveFacilityIdAsync so the auto-publish and manual-review flows reuse ONE record.</summary>
|
|
|
|
|
|
private const string UnknownFacilityName = "نامشخص / ثبت نشده";
|
|
|
|
|
|
|
2026-06-03 08:18:19 +03:30
|
|
|
|
/// <summary>
/// Runs one crawl over every registered source. Each fetched item is deduped by content hash,
/// parsed, rule-validated, optionally AI-audited and then stored as a RawListing; items whose
/// decision is Normalized are published immediately. Afterwards the near-duplicate applicant
/// pass and the post-ingest cleanup run, and an IngestionRun row is written when at least one
/// source produced items.
/// </summary>
/// <param name="ct">Token passed to the fetch, AI-audit and database calls.</param>
/// <returns>Per-source counters for the sources that returned at least one item.</returns>
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // fuzzy-matched + grown as we create
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    var results = new List<SourceResult>();

    foreach (var source in _sources)
    {
        int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
        IReadOnlyList<ScrapedItem> items;
        try
        {
            items = await source.FetchAsync(settings, ct);
        }
        catch (Exception ex)
        {
            // One failing source must not abort the whole crawl.
            _log.LogError(ex, "Source {Source} failed", source.Name);
            continue;
        }

        if (items.Count == 0) continue; // disabled/unconfigured source

        // Rows added below are only written by the SaveChangesAsync at the end of this source, so
        // the database lookup cannot see them yet. Index them by hash so the same text appearing
        // twice in one fetch is counted as a duplicate instead of being stored (and possibly
        // published) twice.
        var pendingByHash = new Dictionary<string, RawListing>();

        foreach (var item in items)
        {
            fetched++;
            var hash = Hash(item.RawText);

            if (!pendingByHash.TryGetValue(hash, out var existing))
            {
                existing = await _db.RawListings.FirstOrDefaultAsync(r => r.ContentHash == hash, ct);
            }

            if (existing is not null)
            {
                // Best-effort geo retry: coords are normally captured only on first ingest, but a
                // re-fetch may now expose a map center the first fetch lacked (Divar can fail-soft to
                // null on a bad response / out-of-bbox). Backfill the cached row when this fetch has
                // coords and the row has none, so an item still sitting in the queue can be placed on
                // the map when an admin publishes it. (A full refresh is the purge-and-reingest flow.)
                if (existing.Lat is null && item.Lat is not null)
                {
                    existing.Lat = item.Lat;
                    existing.Lng = item.Lng;
                }

                dupes++;
                continue;
            }

            var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
            var val = _validator.Validate(item.RawText, parsed);

            // Drop STALE applicant («آماده به کار») posts — a person's availability goes cold fast.
            // Age = the source's real timestamp, else a Persian "time ago" phrase in the text
            // (Divar embeds «۲ هفته پیش»…). Recorded as Discarded (keeps the dedupe hash + audit
            // trail; no AI spend). Shifts/jobs are NOT aged out — their dates are in the future.
            if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
            {
                var stale = new RawListing
                {
                    SourceChannel = item.Source, SourceUrl = item.SourceUrl, RawText = item.RawText.Trim(),
                    ContentHash = hash, Confidence = 0, Status = RawListingStatus.Discarded,
                    ValidationNotes = $"آمادهبهکارِ قدیمی ({age} روز) — نادیده گرفته شد",
                    Lat = item.Lat, Lng = item.Lng,
                };
                _db.RawListings.Add(stale);
                pendingByHash[hash] = stale;
                spam++;
                continue;
            }

            // The AI audit is skipped for rule-detected spam so no AI call is spent on it.
            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam)
            {
                ai = await _ai.AuditAsync(item.RawText, settings, ct);
            }

            var (status, reason, confidence) = Decide(settings, val, ai);

            var raw = new RawListing
            {
                SourceChannel = item.Source,
                SourceUrl = item.SourceUrl,
                RawText = item.RawText.Trim(),
                ContentHash = hash,
                Confidence = confidence,
                ValidationNotes = reason,
                Status = status,
                Lat = item.Lat, Lng = item.Lng, // approx. map coords (Divar) → facility on publish
            };
            _db.RawListings.Add(raw);
            pendingByHash[hash] = raw;

            if (status == RawListingStatus.Normalized)
            {
                try
                {
                    Publish(parsed, ai, raw, roles, cities, districts, facilities);
                    published++;
                }
                catch (Exception ex)
                {
                    // A failed auto-publish falls back to the manual review queue.
                    _log.LogWarning(ex, "Auto-publish failed; queueing instead");
                    raw.Status = RawListingStatus.New;
                    queued++;
                }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        await _db.SaveChangesAsync(ct);
        results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
        _log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
            source.Name, fetched, queued, published, flagged, spam, dupes);
    }

    var summary = new IngestionSummary(results);

    await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch

    // Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive
    // out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords.
    var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default;

    // Persist a run-log row so admins get a crawl history (with a per-source breakdown).
    if (results.Count > 0)
    {
        var detail = string.Join("؛ ", results.Select(r =>
            $"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"))
            + $" || پاکسازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات";
        _db.IngestionRuns.Add(new IngestionRun
        {
            Fetched = summary.TotalFetched,
            Queued = summary.TotalQueued,
            Published = summary.TotalPublished,
            Flagged = summary.TotalFlagged,
            Spam = summary.TotalSpam,
            Duplicates = summary.TotalDuplicates,
            Detail = detail.Length > 2000 ? detail[..2000] : detail, // stored detail is capped at 2000 chars
        });
        await _db.SaveChangesAsync(ct);
    }

    return summary;
}
|
|
|
|
|
|
|
2026-06-20 14:24:20 +03:30
|
|
|
|
/// <summary>
|
|
|
|
|
|
/// Re-run the CURRENT parser/AI/publish pipeline over every already-crawled RawListing, WITHOUT
|
|
|
|
|
|
/// re-fetching from sources. Use this after improving the pipeline to clean up existing aggregated
|
|
|
|
|
|
/// content (de-dupe, fix roles/categories/tags) — unlike <see cref="RunAsync"/> + the purge-cache
|
|
|
|
|
|
/// flow, it keeps every raw text, so nothing is lost to sources only exposing recent posts.
|
|
|
|
|
|
/// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running
|
|
|
|
|
|
/// (one AI call per item) — call it on a background scope, not inside a request.
|
|
|
|
|
|
/// </summary>
|
2026-06-20 16:08:20 +03:30
|
|
|
|
/// <param name="talentOnly">SEO-safe default: only «آماده به کار» (which is NoIndex/Disallow) is
|
|
|
|
|
|
/// deleted &amp; rebuilt, so no INDEXED url changes. Shift/Job detail pages are indexed + in the
|
|
|
|
|
|
/// sitemap, so churning their IDs would 404 ranked pages — instead they self-clean via turnover.
|
|
|
|
|
|
/// Pass false only when you accept that SEO hit.</param>
|
|
|
|
|
|
public async Task<IngestionSummary> ReprocessAsync(bool talentOnly = true, CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();

    // Reference data is loaded once and shared by every item processed below.
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // reused (not deleted) → no facility churn
    var roleNames = roles.ConvertAll(r => r.Name);
    var cityNames = cities.ConvertAll(c => c.Name);
    var districtNames = districts.ConvertAll(d => d.Name);

    // Drop previously-published aggregated content; it's regenerated below from the raw text.
    // DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull.
    await _db.TalentListings
        .Where(t => t.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync(ct);
    if (!talentOnly)
    {
        await _db.Shifts
            .Where(s => s.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync(ct);
        await _db.JobOpenings
            .Where(j => j.Source == ShiftSource.Aggregated)
            .ExecuteDeleteAsync(ct);
    }

    int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0;

    // Oldest first, so listings are recreated in their original ingest order.
    var storedRows = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct);
    foreach (var raw in storedRows)
    {
        ct.ThrowIfCancellationRequested();

        var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames);
        var isTalent = parsed.Kind == ListingKind.Talent;

        // SEO-safe scope: in talent-only mode, leave indexed shift/job listings (and their
        // RawListing links/status) completely untouched — only applicants are rebuilt.
        if (talentOnly && !isTalent)
        {
            continue;
        }

        fetched++;
        raw.LinkedTalentId = null; // talent rows were just deleted
        if (!talentOnly)
        {
            raw.LinkedShiftId = null;
        }

        var val = _validator.Validate(raw.RawText, parsed);

        // Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).
        if (isTalent && HtmlUtil.AgeDaysFromPersianText(raw.RawText) is int age && age > TalentMaxAgeDays)
        {
            raw.Status = RawListingStatus.Discarded;
            raw.Confidence = 0;
            raw.ValidationNotes = $"آمادهبهکارِ قدیمی ({age} روز) — نادیده گرفته شد";
            spam++;
            continue;
        }

        // Rule-detected spam never reaches the AI auditor.
        AiAuditResult? ai = null;
        if (settings.AiEnabled && !val.IsSpam)
        {
            ai = await _ai.AuditAsync(raw.RawText, settings, ct);
        }

        var (status, reason, confidence) = Decide(settings, val, ai);
        raw.Status = status;
        raw.ValidationNotes = reason;
        raw.Confidence = confidence;

        switch (status)
        {
            case RawListingStatus.Normalized:
                try
                {
                    Publish(parsed, ai, raw, roles, cities, districts, facilities);
                    published++;
                }
                catch (Exception ex)
                {
                    // A failed publish falls back to the manual review queue.
                    _log.LogWarning(ex, "Reprocess publish failed; queueing");
                    raw.Status = RawListingStatus.New;
                    queued++;
                }
                break;
            case RawListingStatus.New:
                queued++;
                break;
            case RawListingStatus.Flagged:
                flagged++;
                break;
            default:
                spam++;
                break;
        }

        if (fetched % 50 == 0)
        {
            await _db.SaveChangesAsync(ct); // incremental progress on long runs
        }
    }

    await _db.SaveChangesAsync(ct);

    var deduped = await DedupeTalentAsync(ct); // collapse reposts the exact-hash dedup missed

    var runLog = new IngestionRun
    {
        Fetched = fetched,
        Queued = queued,
        Published = published,
        Flagged = flagged,
        Spam = spam,
        Duplicates = deduped,
        Detail = $"پردازش مجدد آیتمهای ذخیرهشده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی، {deduped} تکراریِ حذفشده",
    };
    _db.IngestionRuns.Add(runLog);
    await _db.SaveChangesAsync(ct);

    _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S} deduped={D}",
        fetched, published, queued, flagged, spam, deduped);

    // The whole reprocess is reported as one pseudo-source.
    var result = new SourceResult("پردازش مجدد", fetched, queued, published, flagged, spam, deduped);
    return new IngestionSummary(new List<SourceResult> { result });
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
|
|
/// Collapse near-duplicate aggregated APPLICANTS left when a source reposts the same ad (different
|
|
|
|
|
|
/// text → different ContentHash, so exact dedup missed them). Two high-precision signals: an
|
|
|
|
|
|
/// identical phone, or identical (role, city, normalized description core with digits/«… پیش»
|
|
|
|
|
|
/// time-phrases removed). Keeps the NEWEST of each group, deletes the rest. Returns the count removed.
|
|
|
|
|
|
/// </summary>
|
|
|
|
|
|
public async Task<int> DedupeTalentAsync(CancellationToken ct = default)
{
    // Only open, aggregated applicants are candidates; project just the columns the signature needs.
    var candidates = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated)
        .Select(t => new { t.Id, t.Phone, t.RoleId, t.CityId, t.Description, t.CreatedAt })
        .ToListAsync(ct);

    // Grouping key for one applicant, or null when there is too little to compare safely.
    string? Signature(string? phone, int roleId, int cityId, string? desc)
    {
        var digits = DigitsOnly(phone ?? "");
        if (digits.Length >= 7)
        {
            return "p:" + digits; // same number = same person/repost
        }

        // Digits and relative-time phrases change between reposts, so they are blanked out first.
        var stripped = Regex.Replace(desc ?? "",
            @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ");
        var core = NormalizeFa(stripped).Trim();
        if (core.Length < 15)
        {
            return null; // too little to call it a dup safely
        }

        var head = core.Length > 100 ? core[..100] : core;
        return $"t:{roleId}:{cityId}:{head}";
    }

    var keyed = candidates
        .Select(c => new { c.Id, c.CreatedAt, Key = Signature(c.Phone, c.RoleId, c.CityId, c.Description) })
        .Where(c => c.Key is not null);

    // Within each signature group the newest row survives; every older one is marked for removal.
    var staleIds = keyed
        .GroupBy(c => c.Key)
        .SelectMany(group => group.OrderByDescending(c => c.CreatedAt).Skip(1))
        .Select(c => c.Id)
        .ToList();

    if (staleIds.Count == 0)
    {
        return 0;
    }

    var removed = await _db.TalentListings
        .Where(t => staleIds.Contains(t.Id))
        .ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate applicants.", removed);
    return removed;
}
|
|
|
|
|
|
|
2026-06-21 05:09:39 +03:30
|
|
|
|
/// <summary>
|
|
|
|
|
|
/// In-place geocoding backfill: for existing AGGREGATED listings in Tehran that still have no map
|
|
|
|
|
|
/// coords, derive an APPROXIMATE neighbourhood center from the stored ad text (TehranGeo) and fill
|
|
|
|
|
|
/// Lat/Lng. Unlike <see cref="ReprocessAsync"/> it never deletes or recreates rows, so listing IDs —
|
|
|
|
|
|
/// and the indexed shift/job URLs in the sitemap — are untouched; safe to run on the live board.
|
|
|
|
|
|
/// Only ever FILLS a null coordinate; a real point (Divar/employer/AI) is never overwritten.
|
|
|
|
|
|
/// Returns how many listings were newly placed on the map.
|
|
|
|
|
|
/// </summary>
|
|
|
|
|
|
public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
{
    var tehran = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
    if (tehran is null)
    {
        return 0; // no Tehran city row → nothing can be located
    }

    var filled = 0;

    // Jobs: located from the description; the city comes from the facility.
    var jobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var job in jobs)
    {
        if (TehranGeo.Locate(job.Description) is { } g)
        {
            job.Lat = g.lat;
            job.Lng = g.lng;
            filled++;
        }
    }

    // Shifts: same rule as jobs.
    var shifts = await _db.Shifts
        .Where(s => s.Status == ShiftStatus.Open && s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var shift in shifts)
    {
        if (TehranGeo.Locate(shift.Description) is { } g)
        {
            shift.Lat = g.lat;
            shift.Lng = g.lng;
            filled++;
        }
    }

    // Applicants carry their own city, and the area note is passed to the locator ahead of the description.
    var applicants = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehran.Id)
        .ToListAsync(ct);
    foreach (var applicant in applicants)
    {
        if (TehranGeo.Locate(applicant.AreaNote, applicant.Description) is { } g)
        {
            applicant.Lat = g.lat;
            applicant.Lng = g.lng;
            filled++;
        }
    }

    // Skip the database round-trip when nothing changed.
    if (filled > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", filled);
    return filled;
}
|
|
|
|
|
|
|
2026-06-21 13:19:11 +03:30
|
|
|
|
/// <summary>
|
|
|
|
|
|
/// The self-cleaning pass run automatically at the end of every crawl (and available on demand):
|
|
|
|
|
|
/// archive out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill
|
|
|
|
|
|
/// missing Tehran map coords. All in-place — reversible (archive, not delete) for listings, guarded
|
|
|
|
|
|
/// (never touches employer/verified facilities) — and pure DB + CPU (no AI, no network), so it's
|
|
|
|
|
|
/// cheap to run on every ingest. Keeps the board tidy without the admin clicking the cleanup buttons.
|
|
|
|
|
|
/// </summary>
|
|
|
|
|
|
public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)>
    RunPostIngestCleanupAsync(CancellationToken ct = default)
{
    // The three passes run strictly in this order: listings, then facilities, then coordinates.
    var purge = await PurgeInvalidAggregatedAsync(ct);
    var facilityPass = await MergeAndCleanFacilitiesAsync(ct);
    var coords = await BackfillCoordsAsync(ct);

    _log.LogInformation("Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
        purge.archived, purge.deduped, facilityPass.merged, facilityPass.cleaned, coords);

    return (purge.archived, purge.deduped, facilityPass.merged, facilityPass.cleaned, coords);
}
|
|
|
|
|
|
|
2026-06-21 05:09:39 +03:30
|
|
|
|
/// <summary>
|
2026-06-21 05:25:51 +03:30
|
|
|
|
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
|
|
|
|
|
|
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)
|
|
|
|
|
|
/// only the ones that are now clearly out-of-scope — domestic-helper («امور منزل»),
|
|
|
|
|
|
/// promotional/training, or spam (i.e. <see cref="ValidationResult.IsSpam"/>). Merely-incomplete-
|
|
|
|
|
|
/// but-legit ads are KEPT. Then collapse near-duplicate job reposts the same way. Archiving (vs
|
|
|
|
|
|
/// hard delete) is the project convention: the row is retained for analysis and the change is
|
|
|
|
|
|
/// reversible, the listing drops out of every public screen + the sitemap (which filter Status ==
|
|
|
|
|
|
/// Open), and its detail page returns 410 Gone (the standard "permanently removed" signal Google
|
|
|
|
|
|
/// uses to deindex). Valid listings are never touched, so their IDs/URLs stay stable.
|
|
|
|
|
|
/// Returns (archived, deduped).
|
2026-06-21 05:09:39 +03:30
|
|
|
|
/// </summary>
|
2026-06-21 05:25:51 +03:30
|
|
|
|
public async Task<(int archived, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
    var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // A listing is out of scope when the current validator marks its text as spam
    // (spam | promo | domestic-helper).
    bool IsOutOfScope(string? text)
    {
        var body = text ?? "";
        var parsed = _parser.Parse(body, roleNames, cityNames, districtNames);
        var verdict = _validator.Validate(body, parsed);
        return verdict.IsSpam;
    }

    var archived = 0;

    // Jobs: screen in memory (the validator cannot run in SQL), then archive by id in one statement.
    var openJobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.Description })
        .ToListAsync(ct);
    var jobIds = openJobs
        .Where(j => IsOutOfScope(j.Description))
        .Select(j => j.Id)
        .ToList();
    if (jobIds.Count > 0)
    {
        var archivedJobs = await _db.JobOpenings
            .Where(j => jobIds.Contains(j.Id))
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
        archived += archivedJobs;
    }

    // Shifts: same screening and archive step.
    var openShifts = await _db.Shifts
        .Where(s => s.Status == ShiftStatus.Open && s.Source == ShiftSource.Aggregated)
        .Select(s => new { s.Id, s.Description })
        .ToListAsync(ct);
    var shiftIds = openShifts
        .Where(s => IsOutOfScope(s.Description))
        .Select(s => s.Id)
        .ToList();
    if (shiftIds.Count > 0)
    {
        var archivedShifts = await _db.Shifts
            .Where(s => shiftIds.Contains(s.Id))
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.Status, ShiftStatus.Archived), ct);
        archived += archivedShifts;
    }

    // Runs after the archive step, so it only sees jobs that are still open.
    var deduped = await DedupeJobsAsync(ct);

    _log.LogInformation("Purge archived {R} out-of-scope aggregated listings; deduped {D} jobs.", archived, deduped);
    return (archived, deduped);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
|
|
/// Collapse near-duplicate aggregated JOB reposts the exact-hash dedupe missed (same ad re-crawled
|
|
|
|
|
|
/// with slightly different surrounding text → different ContentHash). Signature = role + facility +
|
|
|
|
|
|
/// normalized description core with digits/«… پیش» time-phrases stripped. Keeps the NEWEST of each
|
2026-06-21 05:25:51 +03:30
|
|
|
|
/// group and ARCHIVES the rest (Status → Archived, reversible — same rationale as the purge).
|
|
|
|
|
|
/// Per-role fan-out of one ad is preserved (different RoleId → different signature).
|
2026-06-21 05:09:39 +03:30
|
|
|
|
/// </summary>
|
|
|
|
|
|
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
    // Only open, aggregated jobs are candidates; project just the columns the signature needs.
    var candidates = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
        .ToListAsync(ct);

    // Grouping key for one job, or null when the description is too short to compare safely.
    string? Signature(int roleId, int facId, string? desc)
    {
        // Digits and relative-time phrases change between reposts, so they are blanked out first.
        var stripped = Regex.Replace(desc ?? "",
            @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ");
        var core = NormalizeFa(stripped).Trim();
        if (core.Length < 15)
        {
            return null; // too little to call it a dup safely
        }

        var head = core.Length > 120 ? core[..120] : core;
        return $"j:{roleId}:{facId}:{head}";
    }

    var keyed = candidates
        .Select(c => new { c.Id, c.CreatedAt, Key = Signature(c.RoleId, c.FacilityId, c.Description) })
        .Where(c => c.Key is not null);

    // Within each signature group the newest row stays open; every older one is archived.
    var staleIds = keyed
        .GroupBy(c => c.Key)
        .SelectMany(group => group.OrderByDescending(c => c.CreatedAt).Skip(1))
        .Select(c => c.Id)
        .ToList();

    if (staleIds.Count == 0)
    {
        return 0;
    }

    var archived = await _db.JobOpenings
        .Where(j => staleIds.Contains(j.Id))
        .ExecuteUpdateAsync(u => u.SetProperty(j => j.Status, ShiftStatus.Archived), ct);
    _log.LogInformation("Archived {N} near-duplicate aggregated jobs.", archived);
    return archived;
}
|
|
|
|
|
|
|
2026-06-21 05:40:29 +03:30
|
|
|
|
/// <summary>
|
|
|
|
|
|
/// Clean up the crawl-generated facility table: (1) fold listings of junk-named facilities
|
|
|
|
|
|
/// («بیمارستان هستم», «... از مدجابز», bare «کلینیک») into the shared placeholder and delete the
|
|
|
|
|
|
/// junk record; (2) merge Persian-fuzzy duplicates («سازمان برنامه جنوبی» ×3) into one keeper,
|
|
|
|
|
|
/// repointing their shifts/jobs. HARD GUARD: only ever removes facilities that are purely
|
|
|
|
|
|
/// crawl-generated (no owner, not verified, Unverified) and never the placeholder — employer- and
|
|
|
|
|
|
/// admin-managed facilities are untouched. Listings are always repointed first, so no ad is lost.
|
|
|
|
|
|
/// Returns (merged, cleaned).
|
|
|
|
|
|
/// </summary>
|
|
|
|
|
|
public async Task<(int merged, int cleaned)> MergeAndCleanFacilitiesAsync(CancellationToken ct = default)
{
    var allFacilities = await _db.Facilities.ToListAsync(ct);

    // Listing counts per facility, taken once up front; they rank placeholder and keeper candidates.
    var jobsPerFacility = await _db.JobOpenings
        .GroupBy(j => j.FacilityId)
        .Select(g => new { g.Key, Total = g.Count() })
        .ToDictionaryAsync(x => x.Key, x => x.Total, ct);
    var shiftsPerFacility = await _db.Shifts
        .GroupBy(s => s.FacilityId)
        .Select(g => new { g.Key, Total = g.Count() })
        .ToDictionaryAsync(x => x.Key, x => x.Total, ct);

    int Listings(int id)
    {
        return jobsPerFacility.GetValueOrDefault(id) + shiftsPerFacility.GetValueOrDefault(id);
    }

    // The shared "unknown" placeholder is worded differently in older data
    // («مرکز درمانی (نامشخص)») than the current constant, so an exact-name lookup found nothing and
    // the junk-fold step silently no-op'd. Match by the «نامشخص» marker and pick the bucket actually
    // used by the most listings — that's the real placeholder junk should fold into.
    var placeholder = allFacilities
        .Where(f => f.Name == UnknownFacilityName || FacilityMatcher.Normalize(f.Name).Contains("نامشخص"))
        .OrderByDescending(f => Listings(f.Id))
        .FirstOrDefault();
    var placeholderId = placeholder?.Id ?? -1;

    // Removable = purely crawl-generated and unmanaged. Never the placeholder, an owned, or a
    // verified facility (those carry real employer data / verification).
    bool Removable(Facility f)
    {
        if (f.OwnerUserId is not null || f.IsVerified)
        {
            return false;
        }

        if (f.Verification != VerificationStatus.Unverified)
        {
            return false;
        }

        return placeholder is null || f.Id != placeholder.Id;
    }

    // Repoints every shift and job first, then deletes the emptied facility, so no ad is lost.
    async Task AbsorbAsync(int fromId, int toId)
    {
        await _db.Shifts
            .Where(s => s.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(s => s.FacilityId, toId), ct);
        await _db.JobOpenings
            .Where(j => j.FacilityId == fromId)
            .ExecuteUpdateAsync(u => u.SetProperty(j => j.FacilityId, toId), ct);
        await _db.Facilities
            .Where(f => f.Id == fromId)
            .ExecuteDeleteAsync(ct); // cascades stray docs/reviews
    }

    var merged = 0;
    var cleaned = 0;

    // 1) Junk-named crawl facilities → fold into the shared placeholder.
    if (placeholder is not null)
    {
        var junk = allFacilities
            .Where(f => Removable(f) && FacilityMatcher.IsJunkName(f.Name))
            .ToList();
        foreach (var facility in junk)
        {
            await AbsorbAsync(facility.Id, placeholder.Id);
            cleaned++;
        }
    }

    // 2) Merge same-city Persian-fuzzy duplicates into the best keeper (never the placeholder).
    // Re-read the table so facilities deleted in step 1 are no longer considered.
    var current = await _db.Facilities.ToListAsync(ct);
    var remaining = current.Where(f => f.Id != placeholderId).ToList();
    var done = new HashSet<int>();
    foreach (var seed in remaining)
    {
        // Add returns false when the id was already handled as part of an earlier cluster.
        if (!done.Add(seed.Id))
        {
            continue;
        }

        var cluster = remaining
            .Where(o => o.Id != seed.Id && !done.Contains(o.Id)
                && o.CityId == seed.CityId && FacilityMatcher.IsSame(o.Name, seed.Name))
            .ToList();
        if (cluster.Count == 0)
        {
            continue;
        }

        cluster.Add(seed);

        // keeper: verified > owned > most listings > lowest id (oldest).
        var keeper = cluster
            .OrderByDescending(x => x.IsVerified)
            .ThenByDescending(x => x.OwnerUserId.HasValue)
            .ThenByDescending(x => Listings(x.Id))
            .ThenBy(x => x.Id)
            .First();

        foreach (var dup in cluster.Where(x => x.Id != keeper.Id))
        {
            done.Add(dup.Id);
            if (!Removable(dup))
            {
                continue; // never delete an employer/verified facility
            }

            await AbsorbAsync(dup.Id, keeper.Id);
            merged++;
        }
    }

    _log.LogInformation("Facility cleanup: merged {M} duplicates, removed {C} junk facilities.", merged, cleaned);
    return (merged, cleaned);
}
|
|
|
|
|
|
|
2026-06-21 18:06:22 +03:30
|
|
|
|
/// <summary>
/// In-place fix for EXISTING open, aggregated listings labeled «پزشک عمومی» (GP) whose stored ad text
/// actually names a more specific role. Re-runs the keyword parser plus the same doctor-role guard
/// used at publish time over the stored description and re-points the listing's role (and, for job
/// openings, a blank or generic «استخدام پزشک عمومی» title) IN PLACE — no AI call and no
/// delete/recreate, so row IDs are untouched. Only rows currently labeled «پزشک عمومی» are ever changed.
/// </summary>
/// <param name="ct">Cancellation token flowed to every database call.</param>
/// <returns>The number of listings (job openings + talent listings) that were relabeled.</returns>
public async Task<int> RecorrectDoctorRolesAsync(CancellationToken ct = default)
{
    const string GpRoleName = "پزشک عمومی";

    var roles = await _db.Roles.ToListAsync(ct);
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);
    var gp = roles.FirstOrDefault(r => r.Name == GpRoleName);
    if (gp is null) return 0;

    var gpNorm = NormalizeFa(GpRoleName); // loop-invariant; computed once

    // Returns the role the text really describes, or null when nothing more specific than GP is found.
    Role? Corrected(string? text)
    {
        var parsed = _parser.Parse(text ?? "", roleNames, cityNames, districtNames);
        var specific = parsed.RoleNames.FirstOrDefault(n => NormalizeFa(n) != gpNorm);
        if (specific is not null) return ResolveOrCreateRole(roles, specific, null);
        if (LooksSpecialist(text)) return ResolveOrCreateRole(roles, "پزشک متخصص", "پزشک");
        return null;
    }

    int fixedCount = 0;

    var jobs = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated && j.RoleId == gp.Id)
        .ToListAsync(ct);
    foreach (var j in jobs)
    {
        if (Corrected(j.Description) is not { } nr || nr.Id == j.RoleId) continue;

        if (string.IsNullOrWhiteSpace(j.Title) || j.Title == "استخدام پزشک عمومی")
            j.Title = $"استخدام {nr.Name}";

        // Assign the navigation, not RoleId: ResolveOrCreateRole may return a role that was only just
        // added to the context and has no database-generated Id yet (still 0). Copying nr.Id would
        // write RoleId = 0; setting the navigation lets EF fix up the FK once the role is inserted.
        j.Role = nr;
        fixedCount++;
    }

    var talent = await _db.TalentListings
        .Where(t => t.Status == ShiftStatus.Open && t.Source == ShiftSource.Aggregated && t.RoleId == gp.Id)
        .ToListAsync(ct);
    foreach (var t in talent)
    {
        if (Corrected(t.Description) is not { } nr || nr.Id == t.RoleId) continue;

        t.Role = nr; // navigation for the same reason as above
        fixedCount++;
    }

    if (fixedCount > 0) await _db.SaveChangesAsync(ct);
    _log.LogInformation("Recorrected {N} «پزشک عمومی»-mislabeled aggregated listings.", fixedCount);
    return fixedCount;
}
|
|
|
|
|
|
|
2026-06-20 17:54:26 +03:30
|
|
|
|
/// <summary>Returns only the digit characters of <paramref name="s"/>, taken after
/// HtmlUtil.ToLatinDigits has been applied to it (presumably mapping Persian/Arabic digits to
/// 0-9 — confirm against HtmlUtil).</summary>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    return string.Concat(latin.Where(char.IsDigit));
}
|
|
|
|
|
|
|
2026-06-03 17:41:02 +03:30
|
|
|
|
/// <summary>
/// Chooses the status for a raw listing from the rule-validation result, the optional AI audit and
/// the admin settings. Returns the status, a human-readable reason (may be null) and the confidence
/// the decision was based on (the AI's when an audit is present and the ad is not spam, otherwise
/// the validator's).
/// </summary>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    string? ruleNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null;

    // Spam always wins, regardless of any AI verdict.
    if (val.IsSpam)
    {
        return (RawListingStatus.Discarded, Join("اسپم", ruleNotes), val.Confidence);
    }

    bool automatic = s.Mode == IngestionMode.Automatic;

    // No AI audit: the rule validator alone decides.
    if (ai is null)
    {
        RawListingStatus ruleStatus;
        if (!val.IsValid)
        {
            ruleStatus = RawListingStatus.Flagged;
        }
        else if (automatic && val.Confidence >= s.AutoPublishMinConfidence)
        {
            ruleStatus = RawListingStatus.Normalized;
        }
        else
        {
            ruleStatus = RawListingStatus.New;
        }
        return (ruleStatus, ruleNotes, val.Confidence);
    }

    // AI audit present: its verdict drives the outcome.
    var verdict = $"AI: {ai.Decision} ({ai.Confidence}٪)";
    if (ai.Reason is not null)
    {
        verdict += $" — {ai.Reason}";
    }
    var aiNote = Join(verdict, ruleNotes);

    if (ai.Reject)
    {
        return (RawListingStatus.Discarded, aiNote, ai.Confidence);
    }

    if (!ai.Approve)
    {
        return (RawListingStatus.Flagged, aiNote, ai.Confidence); // neither approve nor reject → review
    }

    // Medical gate: an AI approval is vetoed when the rule validator saw nothing clinical, so a
    // hallucinated approval never auto-publishes; it goes to manual review instead.
    if (!val.LooksMedical)
    {
        return (RawListingStatus.Flagged,
            Join("هوش مصنوعی تأیید کرد ولی نشانهٔ کادر درمان یافت نشد — بررسی دستی", aiNote),
            ai.Confidence);
    }

    var approvedStatus = automatic && s.AiAutoApprove ? RawListingStatus.Normalized : RawListingStatus.New;
    return (approvedStatus, aiNote, ai.Confidence);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Publishes one approved raw ad: a single TalentListing for an applicant ad, otherwise one
/// JobOpening or Shift per resolved role, attached to a matched-or-newly-created (unverified)
/// facility. AI-extracted fields are preferred over the keyword parser's where both exist.
/// Entities are only added to the context here (no save); <paramref name="raw"/> is marked Normalized.
/// </summary>
/// <param name="parsed">Keyword-parser output for the ad.</param>
/// <param name="ai">AI audit result, or null when no audit ran; its Data carries the structured fields.</param>
/// <param name="raw">The raw listing: text, source URL and optional source coordinates.</param>
/// <param name="roles">Known roles; ResolveOrCreateRole appends any role it creates.</param>
/// <param name="cities">Known cities; the first active (else first) city is the fallback, so it must be non-empty.</param>
/// <param name="districts">Known districts, matched by name within the resolved city.</param>
/// <param name="facilities">Known facilities; a newly created facility is appended for later ads in the run.</param>
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
    List<Role> roles, List<City> cities, List<District> districts, List<Facility> facilities)
{
    const string GpRoleName = "پزشک عمومی";
    const string GenericGpTitle = "استخدام پزشک عمومی";

    var d = ai?.Data;
    var cityName = d?.City ?? parsed.CityName;
    var districtName = d?.District ?? parsed.DistrictName;

    // One ad can name several roles — resolve them all and publish one listing per role so each is
    // browsable/filterable (capped at 4). The AI's role (+ its category) comes first; parser names
    // follow. Unknown roles are CREATED (dynamic taxonomy), not dropped.
    var candidates = new List<(string name, string? category)>();
    if (!string.IsNullOrWhiteSpace(d?.Role)) candidates.Add((d!.Role!.Trim(), d.Category));
    foreach (var n in parsed.RoleNames) candidates.Add((n, null));
    if (parsed.RoleName is not null) candidates.Add((parsed.RoleName, null));

    var pubRoles = new List<Role>();
    foreach (var (name, category) in candidates)
    {
        if (string.IsNullOrWhiteSpace(name)) continue;
        var role = ResolveOrCreateRole(roles, name, category);
        if (!pubRoles.Contains(role)) pubRoles.Add(role);
        if (pubRoles.Count >= 4) break;
    }
    if (pubRoles.Count == 0) pubRoles.Add(roles.First());

    // Doctor-role guard. «پزشک عمومی» is the weak default, so when it is the primary role, trust the
    // keyword parser: if it found a more specific role in the same text use that; otherwise fall
    // back to «پزشک متخصص» when the text says specialist.
    bool roleCorrected = false;
    if (pubRoles[0].Name == GpRoleName)
    {
        var specific = parsed.RoleNames.FirstOrDefault(n => NormalizeFa(n) != NormalizeFa(GpRoleName));
        if (specific is not null)
        {
            pubRoles[0] = ResolveOrCreateRole(roles, specific, null);
            roleCorrected = true;
        }
        else if (LooksSpecialist(raw.RawText))
        {
            pubRoles[0] = ResolveOrCreateRole(roles, "پزشک متخصص", "پزشک");
            roleCorrected = true;
        }

        // The replacement usually already sits later in the list (parser names were added above),
        // which previously published the same role twice. Keep first occurrences only.
        if (roleCorrected) pubRoles = pubRoles.Distinct().ToList();
    }

    var city = cities.FirstOrDefault(c => c.Name == cityName)
               ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First();
    var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);

    // Approximate map coords: the source ad's own point when present; otherwise, for a Tehran ad,
    // try to geocode from the structured location fields and finally the ad body text.
    double? appLat = raw.Lat, appLng = raw.Lng;
    if (appLat is null && city.Name == "تهران"
        && TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g)
    {
        appLat = g.lat;
        appLng = g.lng;
    }
    // Last resort — the AI's inferred coords, but only when they fall inside the greater-Tehran box
    // (rejects a hallucinated point elsewhere).
    if (appLat is null && d?.Lat is double aLat && d?.Lng is double aLng && InTehran(aLat, aLng))
    {
        appLat = aLat;
        appLng = aLng;
    }

    // A blank AI kind must not mask the parser's kind (same null-or-blank rule as the other AI fields).
    var kindStr = (string.IsNullOrWhiteSpace(d?.Kind) ? parsed.Kind.ToString() : d!.Kind!).ToLowerInvariant();

    // Pay: prefer the AI-extracted figures, falling back to the parser's — for every listing kind.
    // (Shifts previously read only the parser values and so dropped an AI-extracted amount/share.)
    var pay = d?.PayAmount ?? parsed.PayAmount;
    var share = d?.SharePercent ?? parsed.SharePercent;
    var payType = share is not null && pay is null ? PayType.Percentage
        : pay is null ? PayType.Negotiable : PayType.PerShift;

    // Applicant ad — a worker offering themselves. No facility involved.
    if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
    {
        // ONE person = ONE listing: use the primary role; secondary role names become tags.
        var role = pubRoles[0];
        var extraRoleTags = pubRoles.Skip(1).Select(r => r.Name);
        _db.TalentListings.Add(new TalentListing
        {
            Role = role, City = city, DistrictId = district?.Id,
            PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
            YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
            IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
            AreaNote = parsed.AreaNote,
            Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
            Gender = parsed.Gender,
            PayType = payType,
            PayAmount = pay, SharePercent = share,
            Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
            Description = raw.RawText,
            Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
            Lat = appLat, Lng = appLng, // source point or geocoded neighborhood center
            Contacts = BuildContacts(d, parsed),
            Tags = BuildTags(parsed, d, role, city, extraRoleTags),
        });
        raw.Status = RawListingStatus.Normalized;
        return;
    }

    // An unnamed ad falls back to ONE shared placeholder facility. Because that record is shared by
    // every unnamed ad in a city, it must never receive a single ad's approximate coordinates.
    bool unnamed = string.IsNullOrWhiteSpace(d?.FacilityName) && string.IsNullOrWhiteSpace(parsed.FacilityName);
    var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
        : !string.IsNullOrWhiteSpace(parsed.FacilityName) ? parsed.FacilityName!.Trim()
        : UnknownFacilityName;

    // Reuse an existing facility (FacilityMatcher.FindBest) before creating a new one.
    var facility = FacilityMatcher.FindBest(facilities, facilityName, city.Id);
    if (facility is null)
    {
        facility = new Facility
        {
            Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
            Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone, IsVerified = false,
            Lat = unnamed ? null : raw.Lat, Lng = unnamed ? null : raw.Lng, // approximate source point
        };
        _db.Facilities.Add(facility);
        facilities.Add(facility); // so later listings in this run match it too
    }
    else if (!unnamed && facility.Lat is null && facility.Lng is null && raw.Lat is not null)
    {
        // Backfill coords only when the matched, named facility has none — never overwrite an
        // existing location with the source's approximate point.
        facility.Lat = raw.Lat;
        facility.Lng = raw.Lng;
    }

    if (kindStr.Contains("job") || kindStr.Contains("استخدام"))
    {
        // The AI title is used only for a single-role ad, and not when it is the generic GP title
        // for a role the guard just corrected (mirrors RecorrectDoctorRolesAsync).
        var aiTitle = string.IsNullOrWhiteSpace(d?.Title) ? null : d!.Title!.Trim();
        bool useAiTitle = aiTitle is not null && pubRoles.Count == 1
                          && !(roleCorrected && aiTitle == GenericGpTitle);

        foreach (var role in pubRoles)
        {
            _db.JobOpenings.Add(new JobOpening
            {
                Facility = facility, Role = role,
                Title = useAiTitle ? aiTitle! : $"استخدام {role.Name}",
                EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
                SalaryMin = pay,
                Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
                SourceUrl = raw.SourceUrl,
                Lat = appLat, Lng = appLng, // source point or geocoded neighborhood center
                Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh rows per listing
            });
        }
    }
    else
    {
        var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
        var (start, end) = DefaultTimes(st);
        foreach (var role in pubRoles)
        {
            _db.Shifts.Add(new Shift
            {
                Facility = facility, Role = role,
                Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
                StartTime = start, EndTime = end, ShiftType = st,
                SpecialtyRequired = role.Name, Description = raw.RawText,
                PayType = payType,
                PayAmount = pay, SharePercent = share,
                Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
                Lat = appLat, Lng = appLng, // source point or geocoded neighborhood center
                Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh rows per listing
            });
        }
    }
    raw.Status = RawListingStatus.Normalized;
}
|
|
|
|
|
|
|
2026-06-09 19:04:24 +03:30
|
|
|
|
/// <summary>
/// Builds the space-separated tag string for a listing. Sources, in order: the parser's tags, this
/// listing's role name, role category and city name, any secondary role names, then the AI's tags.
/// Blank entries and anything <c>IsNoiseTag</c> rejects are skipped; survivors are trimmed and
/// de-duplicated, keeping first-occurrence order.
/// </summary>
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city,
    IEnumerable<string>? extraRoles = null)
{
    var collected = new List<string>(parsed.Tags);
    collected.Add(role.Name);
    collected.Add(role.Category);
    collected.Add(city.Name);
    if (extraRoles is not null)
    {
        collected.AddRange(extraRoles);
    }
    if (d?.Tags is not null)
    {
        foreach (var aiTag in d.Tags)
        {
            if (!string.IsNullOrWhiteSpace(aiTag)) collected.Add(aiTag.Trim());
        }
    }

    var seen = new HashSet<string>();
    var kept = new List<string>();
    foreach (var candidate in collected)
    {
        if (string.IsNullOrWhiteSpace(candidate) || IsNoiseTag(candidate)) continue;
        var tag = candidate.Trim();
        if (seen.Add(tag)) kept.Add(tag);
    }
    return string.Join(" ", kept);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Words/phrases that are NOT clinical skills — pay, contact, generic verbs, sentence fragments —
// and therefore must not become tag chips. Compared after NormalizeFa.
private static readonly string[] TagStopWords =
{
    "توافقی", "پرداخت", "پرداخت توافقی", "حقوق", "دستمزد", "تماس", "شماره", "شماره تماس",
    "مراقبت از", "مراقبت", "همکاری", "آماده", "آماده به کار", "نیرو", "استخدام", "جذب",
    // personality / filler — not clinical skills
    "خوشاخلاق", "خوش اخلاق", "خوشاخلاق", "دلسوز", "منظم", "مسئولیتپذیر", "مسئولیت پذیر", "باتجربه", "مجرب",
};

// TagStopWords normalized once, instead of re-running NormalizeFa (a regex) over every stop word
// on every IsNoiseTag call. Must stay declared AFTER TagStopWords: static initializers run in
// textual order.
private static readonly HashSet<string> NormalizedTagStopWords =
    new(TagStopWords.Select(w => NormalizeFa(w)), StringComparer.Ordinal);

/// <summary>
/// True when <paramref name="tag"/> should not be shown as a tag: shorter than two characters after
/// normalization, a dangling fragment ending in the preposition «از», or one of <c>TagStopWords</c>.
/// </summary>
private static bool IsNoiseTag(string tag)
{
    var t = NormalizeFa(tag);

    // Ordinal on purpose: EndsWith(string) without a comparison is culture-sensitive.
    if (t.Length < 2
        || t.EndsWith(" از", StringComparison.Ordinal)
        || t.EndsWith("-از", StringComparison.Ordinal))
    {
        return true; // dangling «… از»
    }

    return NormalizedTagStopWords.Contains(t);
}
|
|
|
|
|
|
|
2026-06-09 21:38:55 +03:30
|
|
|
|
/// <summary>
/// Maps a role name onto a Role, creating one when nothing matches. Lookup is layered:
/// (1) a role in <paramref name="roles"/> whose normalized name equals the normalized input,
/// (2) a <c>RoleAliases</c> entry pointing at a canonical role that exists in the list,
/// (3) otherwise a new active Role is added to the context and to <paramref name="roles"/> (under
/// the canonical name when an alias matched), with its category resolved through <c>ResolveCategory</c>.
/// Gender/seniority/availability modifiers are stripped from the name first.
/// </summary>
private Role ResolveOrCreateRole(List<Role> roles, string name, string? category)
{
    // Collapse modifier sprawl before any matching (see StripRoleModifiers).
    name = StripRoleModifiers(name);
    var key = NormalizeFa(name);

    // Linear scan by normalized name; first hit wins.
    Role? FindByNormalizedName(string normalized)
    {
        foreach (var candidate in roles)
        {
            if (NormalizeFa(candidate.Name) == normalized) return candidate;
        }
        return null;
    }

    // (1) Already known under this (normalized) spelling.
    if (FindByNormalizedName(key) is { } known)
    {
        return known;
    }

    // (2) A synonym of a canonical role → reuse that role rather than forking the taxonomy.
    if (RoleAliases.TryGetValue(key, out var canonical))
    {
        if (FindByNormalizedName(NormalizeFa(canonical)) is { } viaAlias)
        {
            return viaAlias;
        }
        name = canonical; // canonical role not present yet → create it under its proper name
    }

    // (3) Genuinely new role.
    int nextSortOrder = roles.Count == 0 ? 1 : roles.Max(r => r.SortOrder) + 1;
    var fresh = new Role
    {
        Name = Clamp(name.Trim(), 100),                  // Role.Name length limit
        Category = Clamp(ResolveCategory(category), 50), // category length limit
        IsActive = true,
        SortOrder = nextSortOrder,
    };
    _db.Roles.Add(fresh);
    roles.Add(fresh); // visible to later lookups in the same run
    _log.LogInformation("Ingestion introduced new role «{Role}» (category «{Category}») from AI.",
        fresh.Name, fresh.Category);
    return fresh;
}
|
|
|
|
|
|
|
2026-06-20 14:24:20 +03:30
|
|
|
|
/// <summary>
/// Maps a suggested category onto the closed set of category groups via <c>CategoryAliases</c>
/// (looked up by normalized text). Categories are never invented: anything without an alias entry,
/// including null/blank input, resolves to «سایر».
/// </summary>
private static string ResolveCategory(string? category)
{
    var key = NormalizeFa(category);
    if (CategoryAliases.TryGetValue(key, out var group))
    {
        return group;
    }
    return "سایر";
}
|
2026-06-09 21:38:55 +03:30
|
|
|
|
|
|
|
|
|
|
// Canonical ROLE name → its synonyms/abbreviations. Flattened by BuildAliasMap into a
// {normalized synonym → canonical} lookup so a differently-worded role maps onto the canonical
// one instead of creating a new role. Keys are matched after NormalizeFa.
private static readonly Dictionary<string, string> RoleAliases = BuildAliasMap(new Dictionary<string, string[]>
{
    { "پزشک عمومی", new[] { "دکتر", "طبیب", "پزشک", "جی پی", "gp", "general practitioner" } },
    { "پزشک متخصص", new[] { "متخصص", "فوق تخصص", "اسپشالیست", "specialist" } },
    { "پرستار", new[] { "نرس", "nurse", "پرستاری", "کارشناس پرستاری" } },
    { "پرستار سالمندان", new[] { "مراقب سالمند", "مراقب سالمندان", "پرستار سالمند", "نگهدار سالمند", "مراقبت سالمند" } },
    { "ماما", new[] { "مامایی", "کارشناس مامایی", "midwife" } },
    { "تکنسین اتاق عمل", new[] { "اتاق عمل", "اسکراب", "scrub", "تکنولوژیست اتاق عمل" } },
    { "تکنسین فوریتهای پزشکی", new[] { "فوریت پزشکی", "تکنسین اورژانس", "پارامدیک", "paramedic", "emt", "اورژانس ۱۱۵" } },
    { "کارشناس آزمایشگاه", new[] { "علوم آزمایشگاهی", "تکنسین آزمایشگاه", "آزمایشگاهی", "لابراتوار", "lab", "laboratory" } },
    { "دندانپزشک", new[] { "دندان پزشک", "دندون پزشک", "dentist" } },
    { "کمک بهیار", new[] { "کمکیار", "کمکیار", "کمک یار", "کمکبهیار", "کمک بیمار" } },
});
|
|
|
|
|
|
|
|
|
|
|
|
// Canonical CATEGORY (role group) → its synonyms. Flattened by BuildAliasMap into the lookup
// that ResolveCategory consults; keys are matched after NormalizeFa.
private static readonly Dictionary<string, string> CategoryAliases = BuildAliasMap(new Dictionary<string, string[]>
{
    { "پزشک", new[] { "دکتر", "طبیب", "doctor", "پزشکی" } },
    { "پرستار", new[] { "پرستاری", "nurse", "nursing" } },
    { "ماما", new[] { "مامایی", "midwifery" } },
    { "تکنسین", new[] { "تکنیسین", "تکنولوژیست", "technician", "کاردان فنی" } },
    { "دندانپزشک", new[] { "دندان پزشک", "دندانپزشکی", "dental" } },
});
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Inverts a {canonical → synonyms} table into a {normalized spelling → canonical} lookup. Each
/// canonical name's own normalized form is registered too, so an exact match resolves to itself.
/// When two spellings normalize to the same key, the later one overwrites the earlier.
/// </summary>
private static Dictionary<string, string> BuildAliasMap(Dictionary<string, string[]> src)
{
    var lookup = new Dictionary<string, string>();
    foreach (var entry in src)
    {
        // Canonical first, then its aliases — same registration order as writing them out by hand.
        foreach (var spelling in entry.Value.Prepend(entry.Key))
        {
            lookup[NormalizeFa(spelling)] = entry.Key;
        }
    }
    return lookup;
}
|
|
|
|
|
|
|
2026-06-09 19:04:24 +03:30
|
|
|
|
/// <summary>
/// Normalizes a Persian string for comparison/dedupe: maps Arabic Yeh (U+064A) to Persian Yeh
/// (U+06CC) and Arabic Kaf (U+0643) to Persian Keheh (U+06A9), turns the zero-width non-joiner
/// (U+200C) into a regular space, trims, collapses whitespace runs to a single space and
/// lower-cases (so Latin tags like "ICU"/"icu" also match). Null is treated as empty.
/// </summary>
/// <remarks>
/// The characters are written as <c>\u</c> escapes on purpose: the ZWNJ is invisible in source and
/// the Arabic/Persian letter pairs are near-identical glyphs, so literal characters are easy to
/// lose or swap without noticing.
/// </remarks>
private static string NormalizeFa(string? s)
{
    var unified = (s ?? "")
        .Replace('\u064A', '\u06CC') // Arabic Yeh  → Persian Yeh
        .Replace('\u0643', '\u06A9') // Arabic Kaf  → Persian Keheh
        .Replace('\u200C', ' ')      // ZWNJ        → space (then collapsed below)
        .Trim();
    return Regex.Replace(unified, @"\s+", " ").ToLowerInvariant();
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>Returns <paramref name="s"/> unchanged when it is at most <paramref name="max"/>
/// characters long; otherwise its first <paramref name="max"/> characters with surrounding
/// whitespace trimmed (trimming happens only on the truncated form).</summary>
private static string Clamp(string s, int max)
{
    if (s.Length <= max)
    {
        return s;
    }
    return s.Substring(0, max).Trim();
}
|
|
|
|
|
|
|
2026-06-20 15:48:42 +03:30
|
|
|
|
/// <summary>
/// True when the point lies inside the bounding box latitude 35.4–35.95, longitude 51.0–51.8
/// (both ends inclusive) — used to reject out-of-area AI coordinates.
/// </summary>
private static bool InTehran(double lat, double lng)
{
    const double south = 35.4, north = 35.95;
    const double west = 51.0, east = 51.8;

    bool latitudeOk = lat >= south && lat <= north;
    bool longitudeOk = lng >= west && lng <= east;
    return latitudeOk && longitudeOk;
}
|
|
|
|
|
|
|
2026-06-21 13:29:43 +03:30
|
|
|
|
// Markers that mean a doctor role is a SPECIALIST, not a GP — used to correct a «پزشک عمومی»
// mislabel on a clearly-specialist ad. Compared after NormalizeFa.
private static readonly string[] SpecialistMarkers =
    { "متخصص", "فوق تخصص", "فوقتخصص", "فلوشیپ", "فلوشیب", "بورد تخصصی", "ساب اسپشالیتی", "ent" };

/// <summary>
/// True when the (normalized) ad text contains any of <c>SpecialistMarkers</c>. Persian markers
/// match as substrings; purely Latin markers such as "ent" must appear as a standalone word so
/// they do not fire inside unrelated words ("dental", "center", "patient").
/// </summary>
/// <param name="rawText">Ad text; null is treated as empty.</param>
private static bool LooksSpecialist(string? rawText)
{
    var t = NormalizeFa(rawText);
    return SpecialistMarkers.Any(m => ContainsSpecialistMarker(t, NormalizeFa(m)));
}

/// <summary>
/// Substring search for a normalized marker in normalized text; for an all-Latin-letter marker the
/// match must additionally not be flanked by a Latin letter or digit on either side.
/// </summary>
private static bool ContainsSpecialistMarker(string text, string marker)
{
    static bool IsLatinWordChar(char c) => c is (>= 'a' and <= 'z') or (>= '0' and <= '9');

    if (marker.Length == 0) return false;

    // NormalizeFa lower-cases, so checking a-z is enough to recognize a Latin marker.
    bool latinMarker = marker.All(c => c is >= 'a' and <= 'z');
    if (!latinMarker) return text.Contains(marker, StringComparison.Ordinal);

    for (int at = text.IndexOf(marker, StringComparison.Ordinal);
         at >= 0;
         at = text.IndexOf(marker, at + 1, StringComparison.Ordinal))
    {
        int end = at + marker.Length;
        bool boundaryBefore = at == 0 || !IsLatinWordChar(text[at - 1]);
        bool boundaryAfter = end == text.Length || !IsLatinWordChar(text[end]);
        if (boundaryBefore && boundaryAfter) return true;
    }
    return false;
}
|
|
|
|
|
|
|
2026-06-20 15:41:06 +03:30
|
|
|
|
// Gender/seniority words that are removed from a role name as whole tokens.
private static readonly string[] RoleModifierWords =
{
    "آقا", "خانم", "خانوم", "بانو", "مرد", "زن", "کارآموز", "کارورز", "ارشد", "مبتدی",
};

// Availability phrases sometimes glued onto a role name; removed as whole substrings before the
// per-token pass, so their individual words are not treated as modifiers anywhere else.
private static readonly string[] RolePhraseNoise =
{
    "آماده به کار", "آماده همکاری", "آماده بکار", "آماده به همکاری", "جویای کار", "دنبال کار", "جهت همکاری",
};

/// <summary>
/// Reduces a role name to its base profession: normalizes it, cuts out every
/// <c>RolePhraseNoise</c> phrase, then drops tokens equal to a <c>RoleModifierWords</c> entry.
/// The result is the remaining tokens joined by single spaces; if nothing remains, the original
/// name (trimmed) is returned instead of an empty string.
/// </summary>
private static string StripRoleModifiers(string name)
{
    var cleaned = NormalizeFa(name);
    foreach (var phrase in RolePhraseNoise)
    {
        cleaned = cleaned.Replace(NormalizeFa(phrase), " ");
    }

    var baseWords = new List<string>();
    foreach (var word in cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries))
    {
        bool isModifier = false;
        foreach (var modifier in RoleModifierWords)
        {
            if (NormalizeFa(modifier) == word)
            {
                isModifier = true;
                break;
            }
        }
        if (!isModifier) baseWords.Add(word);
    }

    return baseWords.Count == 0 ? name.Trim() : string.Join(" ", baseWords);
}
|
|
|
|
|
|
|
2026-06-08 11:10:19 +03:30
|
|
|
|
/// <summary>
/// Creates fresh ContactMethod rows for ONE published listing (talent, job opening or shift): the
/// parser's contacts in their original order (SortOrder 0, 1, …), plus the AI-extracted phone
/// inserted first as a Mobile contact (SortOrder -1) — but only when the parser found no
/// Mobile/Phone contact of its own.
/// </summary>
private static List<ContactMethod> BuildContacts(AiStructured? d, ParsedListing parsed)
{
    var result = new List<ContactMethod>();
    int order = 0;
    foreach (var c in parsed.Contacts)
    {
        result.Add(new ContactMethod { Type = c.Type, Value = c.Value, SortOrder = order });
        order++;
    }

    var aiPhone = d?.Phone;
    if (string.IsNullOrWhiteSpace(aiPhone))
    {
        return result;
    }

    bool parserHasPhone = result.Exists(c => c.Type is ContactType.Mobile or ContactType.Phone);
    if (!parserHasPhone)
    {
        result.Insert(0, new ContactMethod { Type = ContactType.Mobile, Value = aiPhone!.Trim(), SortOrder = -1 });
    }
    return result;
}
|
|
|
|
|
|
|
2026-06-03 17:41:02 +03:30
|
|
|
|
/// <summary>
/// Maps the AI's shift-type string (case-insensitive: "day", "evening", "night", "oncall") to a
/// ShiftType. Any other value, including null, falls back to the parser's value, then to Day.
/// </summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "day":
            return ShiftType.Day;
        case "evening":
            return ShiftType.Evening;
        case "night":
            return ShiftType.Night;
        case "oncall":
            return ShiftType.OnCall;
        default:
            return parsed ?? ShiftType.Day;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Maps the AI's employment-type string (case-insensitive: "parttime", "contract", "plan",
/// "fulltime") to an EmploymentType. Any other value, including null, falls back to the parser's
/// value, then to FullTime.
/// </summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "parttime":
            return EmploymentType.PartTime;
        case "contract":
            return EmploymentType.Contract;
        case "plan":
            return EmploymentType.Plan;
        case "fulltime":
            return EmploymentType.FullTime;
        default:
            return parsed ?? EmploymentType.FullTime;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Default (start, end) clock times for a shift type: Day 08:00–14:00, Evening 14:00–20:00,
/// Night 20:00–08:00; every other type gets 08:00–08:00.
/// </summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t)
{
    var eight = new TimeOnly(8, 0);
    var fourteen = new TimeOnly(14, 0);
    var twenty = new TimeOnly(20, 0);

    switch (t)
    {
        case ShiftType.Day:
            return (eight, fourteen);
        case ShiftType.Evening:
            return (fourteen, twenty);
        case ShiftType.Night:
            return (twenty, eight);
        default:
            return (eight, eight);
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>Appends <paramref name="b"/> to <paramref name="a"/> separated by " | "; returns
/// <paramref name="a"/> alone when <paramref name="b"/> is null or empty.</summary>
private static string? Join(string a, string? b)
{
    if (string.IsNullOrEmpty(b))
    {
        return a;
    }
    return a + " | " + b;
}
|
|
|
|
|
|
|
2026-06-03 08:18:19 +03:30
|
|
|
|
/// <summary>
/// Content hash used for dedupe: the text is trimmed, whitespace runs are collapsed to a single
/// space, and the UTF-8 bytes are hashed with SHA-256. Returned as lower-case hex. Null is
/// treated as empty.
/// </summary>
private static string Hash(string text)
{
    var trimmed = (text ?? string.Empty).Trim();
    var canonical = Regex.Replace(trimmed, @"\s+", " ");
    byte[] digest = SHA256.HashData(Encoding.UTF8.GetBytes(canonical));
    return Convert.ToHexString(digest).ToLowerInvariant();
}
|
2026-06-10 21:28:12 +03:30
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Age of a post in whole days. When the item carries a PostedAt timestamp, this is the floored
/// number of days between it and <c>DateTime.UtcNow</c>, never below zero. Otherwise the result
/// comes from <c>HtmlUtil.AgeDaysFromPersianText</c> over the raw text (presumably null when no
/// "time ago" phrase is found — confirm against HtmlUtil).
/// </summary>
private static int? PostAgeDays(ScrapedItem item)
{
    if (item.PostedAt is not DateTime posted)
    {
        return HtmlUtil.AgeDaysFromPersianText(item.RawText);
    }

    var elapsed = DateTime.UtcNow - posted;
    int wholeDays = (int)Math.Floor(elapsed.TotalDays);
    return wholeDays < 0 ? 0 : wholeDays; // a future-dated post counts as age 0
}
|
2026-06-03 08:18:19 +03:30
|
|
|
|
}
|