AI qualify: de-dupe applicants, base roles, closed categories, tag hygiene + reprocess-stored action
CI/CD / CI · dotnet build (push) Successful in 2m35s
CI/CD / Deploy · hamkadr (push) Successful in 1m23s

Qualified live applicants and found three problems, all fixed:
- Duplicate cards: one ad fanned out into «پرستار» + «پرستار کودک» (same person).
  Applicants now publish ONE listing (no role fan-out); secondary roles → tags.
- Role sprawl: modifiers became roles. Prompt now returns the BASE profession
  and pushes age-group/ward/seniority to tags; new roles only for a genuinely
  new base profession (تکنسین داروخانه ✓, پرستار کودک ✗).
- Tag/category noise: categories pinned to the 5 fixed groups (+سایر, never
  invented); BuildTags drops pay/contact/location/fragment words.

Reprocess action: IngestionService.ReprocessAsync re-runs the current pipeline
over every stored RawListing WITHOUT re-fetching (keeps the raw text, so nothing
is lost to sources only exposing recent posts), deleting the old aggregated
posts and republishing cleanly. Admin dashboard button «پردازش مجددِ آیتم‌های
ذخیره‌شده» runs it on a background scope; result lands in the run-log.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-20 14:24:20 +03:30
parent 4c0b29addf
commit d62929ca0d
5 changed files with 182 additions and 48 deletions
@@ -48,9 +48,9 @@ public class OpenAiCompatibleAuditor : IAiAuditor
confidence: عدد ۰ تا ۱۰۰
reason: توضیح کوتاه فارسی
kind: shift (شیفت توسط مرکز) | job (استخدام توسط مرکز) | talent (کادر درمان که خودش «آماده به کار» است)
role: عنوان دقیق نقش درمانی (مثل پرستار، پزشک عمومی، دندانپزشک، تکنسین اتاق عمل، ماما، کارشناس آزمایشگاه). اگر تخصص دقیق در فهرست نبود، همان عنوان دقیق را برگردان.
category: گروه نقش (پزشک | پرستار | ماما | تکنسین | دندانپزشک). اگر هیچکدام مناسب نبود، یک گروه کوتاه و مناسب پیشنهاد بده.
tags: آرایهای از کلیدواژههای مهارت/الزام مرتبط بهصورت رشته (مثل "ICU"، "MMT"، "CPR"، "پروانه‌دار"، "خانم") یا []
role: «حرفهٔ پایه»، نه با توصیفگر. گروه سنی/بخش/سطح را در tags بگذار («پرستار کودک»role «پرستار»). فقط برای حرفهٔ پایهٔ متفاوت که در فهرست نیست نقش جدید بساز.
category: فقط یکی از این پنج: پزشک | پرستار | ماما | تکنسین | دندانپزشک. اگر نگنجید «سایر». هرگز گروه جدید نساز.
tags: آرایهٔ کلیدواژههای بالینی (مهارت/بخش/گواهی/گروه سنی/سطح) مثل "ICU"،"دیالیز"،"کودک"،"پروانه‌دار". بدون مبلغ/پرداخت/تماس/شهر یا جملهٔ ناقص. اگر نبود [].
city, district: نام شهر و محله/منطقه در صورت ذکر
shiftType: day|evening|night|oncall (فقط برای shift)
employmentType: fulltime|parttime|contract|plan
@@ -168,6 +168,84 @@ public class IngestionService
return summary;
}
/// <summary>
/// Re-runs the CURRENT parser/AI/publish pipeline over every already-crawled RawListing, WITHOUT
/// re-fetching from sources. Use this after improving the pipeline to clean up existing aggregated
/// content (de-dupe, fix roles/categories/tags) — unlike <see cref="RunAsync"/> + the purge-cache
/// flow, it keeps every raw text, so nothing is lost to sources only exposing recent posts.
/// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running
/// (one AI call per item when AI is enabled) — call it on a background scope, not inside a request.
/// </summary>
/// <param name="ct">Checked at the top of every item and passed to every database/AI call.</param>
/// <returns>A summary holding a single synthetic source row with the reprocess counters.</returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="ct"/> is cancelled.</exception>
public async Task<IngestionSummary> ReprocessAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // reused (not deleted) → no facility churn
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    // Drop previously-published aggregated content; it's regenerated below from the raw text.
    // DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull.
    // NOTE(review): these bulk deletes execute immediately and are not wrapped in a transaction
    // here, so a cancelled or failed run leaves the aggregated content only partially rebuilt
    // until the action is run again — confirm that is acceptable.
    await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
    await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
    await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);

    int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0;
    var raws = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct);
    foreach (var raw in raws)
    {
        ct.ThrowIfCancellationRequested();
        fetched++;
        // Old links were just deleted.
        // NOTE(review): only the shift/talent back-references are reset here — confirm there is
        // no separate job-opening back-reference that needs the same treatment.
        raw.LinkedShiftId = null; raw.LinkedTalentId = null;

        var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames);
        var val = _validator.Validate(raw.RawText, parsed);

        // Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).
        // Deliberately an if/else rather than `continue`, so the periodic checkpoint at the bottom
        // of the loop also runs when the item landing on a multiple of 50 is discarded as stale.
        if (parsed.Kind == ListingKind.Talent
            && HtmlUtil.AgeDaysFromPersianText(raw.RawText) is int age && age > TalentMaxAgeDays)
        {
            raw.Status = RawListingStatus.Discarded; raw.Confidence = 0;
            raw.ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد";
            spam++;
        }
        else
        {
            // The AI audit is skipped for items the validator already marked as spam.
            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam)
                ai = await _ai.AuditAsync(raw.RawText, settings, ct);

            var (status, reason, confidence) = Decide(settings, val, ai);
            raw.Status = status; raw.ValidationNotes = reason; raw.Confidence = confidence;

            if (status == RawListingStatus.Normalized)
            {
                // A publish failure is not fatal: the item is put back in the manual queue.
                try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
                catch (Exception ex) { _log.LogWarning(ex, "Reprocess publish failed; queueing"); raw.Status = RawListingStatus.New; queued++; }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        if (fetched % 50 == 0) await _db.SaveChangesAsync(ct); // incremental progress on long runs
    }
    await _db.SaveChangesAsync(ct);

    // Record the run so the result shows up in the ingestion run-log.
    _db.IngestionRuns.Add(new IngestionRun
    {
        Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = 0,
        Detail = $"پردازش مجدد آیتم‌های ذخیره‌شده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی",
    });
    await _db.SaveChangesAsync(ct);

    _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S}",
        fetched, published, queued, flagged, spam);
    return new IngestionSummary(new List<SourceResult>
        { new("پردازش مجدد", fetched, queued, published, flagged, spam, 0) });
}
private static (RawListingStatus status, string? reason, int confidence) Decide(
AppSetting s, ValidationResult val, AiAuditResult? ai)
{
@@ -234,28 +312,31 @@ public class IngestionService
// «آماده به کار» — a worker offering themselves. No facility involved.
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
{
// Prefer the AI's tags when present, else the heuristic parser.
// ONE person = ONE listing. Do NOT fan out across roles: an applicant has a single
// profession, and «پرستار» + «پرستار کودک» from the same ad were producing duplicate
// cards. Use the primary (AI) role; any secondary role names become searchable tags.
var role = pubRoles[0];
var extraRoleTags = pubRoles.Skip(1).Select(r => r.Name);
var tPay = d?.PayAmount ?? parsed.PayAmount;
var tShare = d?.SharePercent ?? parsed.SharePercent;
foreach (var role in pubRoles)
_db.TalentListings.Add(new TalentListing
{
Role = role, City = city, DistrictId = district?.Id,
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
AreaNote = parsed.AreaNote,
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
Gender = parsed.Gender,
PayType = tShare is not null && tPay is null ? PayType.Percentage
: tPay is null ? PayType.Negotiable : PayType.PerShift,
PayAmount = tPay, SharePercent = tShare,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
Description = raw.RawText,
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
Contacts = BuildContacts(d, parsed), // fresh instances per listing
Tags = BuildTags(parsed, d, role, city),
});
_db.TalentListings.Add(new TalentListing
{
Role = role, City = city, DistrictId = district?.Id,
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
AreaNote = parsed.AreaNote,
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
Gender = parsed.Gender,
PayType = tShare is not null && tPay is null ? PayType.Percentage
: tPay is null ? PayType.Negotiable : PayType.PerShift,
PayAmount = tPay, SharePercent = tShare,
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
Description = raw.RawText,
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
Contacts = BuildContacts(d, parsed),
Tags = BuildTags(parsed, d, role, city, extraRoleTags),
});
raw.Status = RawListingStatus.Normalized;
return;
}
@@ -325,13 +406,34 @@ public class IngestionService
}
/// <summary>Space-separated searchable tags: parsed cert/skill tags + AI-detected skills/requirements
/// + this listing's role/category + city. Drives deep search and tag chips on the applicant card.</summary>
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city)
/// + secondary role names + this listing's role/category + city. Pay/contact/location noise and
/// sentence fragments are filtered out so chips stay clinical. Drives deep search + tag chips.</summary>
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city,
IEnumerable<string>? extraRoles = null)
{
var tags = new List<string>(parsed.Tags) { role.Name, role.Category, city.Name };
if (extraRoles is not null) tags.AddRange(extraRoles);
if (d?.Tags is not null)
tags.AddRange(d.Tags.Where(t => !string.IsNullOrWhiteSpace(t)).Select(t => t.Trim()));
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
return string.Join(" ", tags
.Where(t => !string.IsNullOrWhiteSpace(t) && !IsNoiseTag(t))
.Select(t => t.Trim())
.Distinct());
}
// Stop list for tag chips: words/phrases that are NOT clinical skills — pay terms, contact
// words, generic hiring verbs and sentence fragments — which were polluting the chips
// («پرداخت توافقی»، «مراقبت از»…). IsNoiseTag compares each entry to the candidate tag by exact
// equality after NormalizeFa on both sides, so multi-word phrases are listed in full next to
// their single-word parts; an entry never matches as a substring of a longer tag.
private static readonly string[] TagStopWords =
{
"توافقی", "پرداخت", "پرداخت توافقی", "حقوق", "دستمزد", "تماس", "شماره", "شماره تماس",
"مراقبت از", "مراقبت", "همکاری", "آماده", "آماده به کار", "نیرو", "استخدام", "جذب",
};
/// <summary>
/// Returns <c>true</c> when <paramref name="tag"/> should be dropped from the tag chips: it is
/// shorter than two characters after normalization, it is a dangling fragment ending in «از»,
/// or it equals one of the <see cref="TagStopWords"/> entries after normalization.
/// </summary>
/// <param name="tag">Candidate tag text; it is normalized with NormalizeFa before any check.</param>
private static bool IsNoiseTag(string tag)
{
    var t = NormalizeFa(tag);

    // Ordinal on purpose: the suffix check is a literal code-unit match. The culture-sensitive
    // default of EndsWith(string) depends on the host culture/ICU data and ignores zero-width
    // characters, which is not what a fixed Persian suffix test should do.
    if (t.Length < 2
        || t.EndsWith(" از", StringComparison.Ordinal)
        || t.EndsWith("-از", StringComparison.Ordinal))
        return true; // dangling «… از»

    return TagStopWords.Any(w => NormalizeFa(w) == t);
}
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
@@ -360,7 +462,7 @@ public class IngestionService
var created = new Role
{
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
Category = Clamp(ResolveCategory(category), 50), // closed set → respect MaxLength(50)
IsActive = true,
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
};
@@ -371,19 +473,12 @@ public class IngestionService
return created;
}
/// <summary>Map an AI-suggested category to a canonical one: synonym alias first
/// (پزشکی→پزشک، nursing→پرستار…), then any existing category that normalizes the same, else as-is.</summary>
private static string ResolveCategory(List<Role> roles, string? category)
{
var raw = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
// Resolve to a canonical first (synonym alias), then to whichever normalized form is the
// matching target. Crucially, ALWAYS prefer a category string already stored on a role — even
// after an alias maps to a canonical — so we never fork a second variant of the same group.
var target = CategoryAliases.TryGetValue(NormalizeFa(raw), out var canonical) ? canonical : raw;
var targetNorm = NormalizeFa(target);
return roles.Select(r => r.Category)
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == targetNorm) ?? target;
}
/// <summary>Map an AI-suggested category to one of the FIXED groups (پزشک/پرستار/ماما/تکنسین/
/// دندانپزشک). Categories are a closed taxonomy — they drive the filter chips — so unlike roles
/// they are NEVER invented: a synonym resolves to its canonical group, anything else → «سایر».
/// (CategoryAliases maps each canonical group to itself, so exact matches resolve here too.)</summary>
/// <param name="category">Category text suggested by the AI; may be null or blank.</param>
/// <returns>The canonical group found in CategoryAliases, or «سایر» when there is no match.</returns>
private static string ResolveCategory(string? category)
{
    const string fallback = "سایر";

    // A missing category goes straight to the fallback instead of being pushed through
    // NormalizeFa and the dictionary lookup: Dictionary.TryGetValue throws on a null key, and
    // NormalizeFa's handling of null is not visible from here.
    if (string.IsNullOrWhiteSpace(category)) return fallback;

    return CategoryAliases.TryGetValue(NormalizeFa(category), out var canonical) ? canonical : fallback;
}
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.