AI qualify: de-dupe applicants, base roles, closed categories, tag hygiene + reprocess-stored action
Qualified live applicants and found three problems, all fixed: - Duplicate cards: one ad fanned out into «پرستار» + «پرستار کودک» (same person). Applicants now publish ONE listing (no role fan-out); secondary roles → tags. - Role sprawl: modifiers became roles. Prompt now returns the BASE profession and pushes age-group/ward/seniority to tags; new roles only for a genuinely new base profession (تکنسین داروخانه ✓, پرستار کودک ✗). - Tag/category noise: categories pinned to the 5 fixed groups (+سایر, never invented); BuildTags drops pay/contact/location/fragment words. Reprocess action: IngestionService.ReprocessAsync re-runs the current pipeline over every stored RawListing WITHOUT re-fetching (keeps the raw text, so nothing is lost to sources only exposing recent posts), deleting the old aggregated posts and republishing cleanly. Admin dashboard button «پردازش مجددِ آیتمهای ذخیرهشده» runs it on a background scope; result lands in the run-log. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -168,6 +168,84 @@ public class IngestionService
|
||||
return summary;
|
||||
}
|
||||
|
||||
/// <summary>
/// Re-run the CURRENT parser/AI/publish pipeline over every already-crawled RawListing, WITHOUT
/// re-fetching from sources. Use this after improving the pipeline to clean up existing aggregated
/// content (de-dupe, fix roles/categories/tags) — unlike <see cref="RunAsync"/> + the purge-cache
/// flow, it keeps every raw text, so nothing is lost to sources only exposing recent posts.
/// Deletes the old aggregated posts, then republishes from the stored raw text. Long-running
/// (one AI call per item when AI is enabled) — call it on a background scope, not inside a request.
/// </summary>
/// <param name="ct">Checked before each item and passed to the database and AI calls.</param>
/// <returns>A summary holding one result row with the counters of this reprocess run.</returns>
public async Task<IngestionSummary> ReprocessAsync(CancellationToken ct = default)
{
    var settings = await _settings.GetAsync();
    var roles = await _db.Roles.ToListAsync(ct);
    var cities = await _db.Cities.ToListAsync(ct);
    var districts = await _db.Districts.ToListAsync(ct);
    var facilities = await _db.Facilities.ToListAsync(ct); // reused (not deleted) → no facility churn
    var roleNames = roles.Select(r => r.Name).ToList();
    var cityNames = cities.Select(c => c.Name).ToList();
    var districtNames = districts.Select(d => d.Name).ToList();

    // Drop previously-published aggregated content; it's regenerated below from the raw text.
    // DB cascade clears their ContactMethods/Applications/InterestEvents; RawListing back-refs SetNull.
    // NOTE: these deletes execute immediately, before any item is reprocessed.
    await _db.Shifts.Where(s => s.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
    await _db.JobOpenings.Where(j => j.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);
    await _db.TalentListings.Where(t => t.Source == ShiftSource.Aggregated).ExecuteDeleteAsync(ct);

    int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0;
    var raws = await _db.RawListings.OrderBy(r => r.Id).ToListAsync(ct);
    foreach (var raw in raws)
    {
        ct.ThrowIfCancellationRequested();
        fetched++;
        raw.LinkedShiftId = null; raw.LinkedTalentId = null; // old links were just deleted

        var parsed = _parser.Parse(raw.RawText, roleNames, cityNames, districtNames);
        var val = _validator.Validate(raw.RawText, parsed);

        // Stale-applicant filter — age from the Persian "time ago" phrase in the text (Divar).
        if (parsed.Kind == ListingKind.Talent
            && HtmlUtil.AgeDaysFromPersianText(raw.RawText) is int age && age > TalentMaxAgeDays)
        {
            raw.Status = RawListingStatus.Discarded; raw.Confidence = 0;
            raw.ValidationNotes = $"آمادهبهکارِ قدیمی ({age} روز) — نادیده گرفته شد";
            spam++;
        }
        else
        {
            // The AI audit is skipped when AI is disabled or the validator already marked the item as spam.
            AiAuditResult? ai = null;
            if (settings.AiEnabled && !val.IsSpam)
            {
                ai = await _ai.AuditAsync(raw.RawText, settings, ct);
            }

            var (status, reason, confidence) = Decide(settings, val, ai);
            raw.Status = status; raw.ValidationNotes = reason; raw.Confidence = confidence;

            if (status == RawListingStatus.Normalized)
            {
                // A publish failure must not abort the whole run: the item is put back in the queue.
                try { Publish(parsed, ai, raw, roles, cities, districts, facilities); published++; }
                catch (Exception ex) { _log.LogWarning(ex, "Reprocess publish failed; queueing"); raw.Status = RawListingStatus.New; queued++; }
            }
            else if (status == RawListingStatus.New) queued++;
            else if (status == RawListingStatus.Flagged) flagged++;
            else spam++;
        }

        // Incremental progress on long runs. This checkpoint is reached for EVERY item, including
        // discarded stale applicants, so a stale item on a multiple of 50 no longer skips the save.
        if (fetched % 50 == 0) await _db.SaveChangesAsync(ct);
    }
    await _db.SaveChangesAsync(ct);

    // Record the run so the result shows up in the run-log.
    _db.IngestionRuns.Add(new IngestionRun
    {
        Fetched = fetched, Queued = queued, Published = published, Flagged = flagged, Spam = spam, Duplicates = 0,
        Detail = $"پردازش مجدد آیتمهای ذخیرهشده — {fetched} آیتم: {published} منتشر، {queued} صف، {flagged} پرچم، {spam} ردشده/قدیمی",
    });
    await _db.SaveChangesAsync(ct);
    _log.LogInformation("Reprocess done: items={F} published={P} queued={Q} flagged={Fl} discarded={S}",
        fetched, published, queued, flagged, spam);

    return new IngestionSummary(new List<SourceResult>
        { new("پردازش مجدد", fetched, queued, published, flagged, spam, 0) });
}
|
||||
|
||||
private static (RawListingStatus status, string? reason, int confidence) Decide(
|
||||
AppSetting s, ValidationResult val, AiAuditResult? ai)
|
||||
{
|
||||
@@ -234,28 +312,31 @@ public class IngestionService
|
||||
// «آماده به کار» — a worker offering themselves. No facility involved.
|
||||
if (parsed.Kind == ListingKind.Talent || kindStr.Contains("talent") || kindStr.Contains("آماده"))
|
||||
{
|
||||
// Prefer the AI's tags when present, else the heuristic parser.
|
||||
// ONE person = ONE listing. Do NOT fan out across roles: an applicant has a single
|
||||
// profession, and «پرستار» + «پرستار کودک» from the same ad were producing duplicate
|
||||
// cards. Use the primary (AI) role; any secondary role names become searchable tags.
|
||||
var role = pubRoles[0];
|
||||
var extraRoleTags = pubRoles.Skip(1).Select(r => r.Name);
|
||||
var tPay = d?.PayAmount ?? parsed.PayAmount;
|
||||
var tShare = d?.SharePercent ?? parsed.SharePercent;
|
||||
foreach (var role in pubRoles)
|
||||
_db.TalentListings.Add(new TalentListing
|
||||
{
|
||||
Role = role, City = city, DistrictId = district?.Id,
|
||||
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
|
||||
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
|
||||
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
|
||||
AreaNote = parsed.AreaNote,
|
||||
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||
Gender = parsed.Gender,
|
||||
PayType = tShare is not null && tPay is null ? PayType.Percentage
|
||||
: tPay is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = tPay, SharePercent = tShare,
|
||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
|
||||
Description = raw.RawText,
|
||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||
Contacts = BuildContacts(d, parsed), // fresh instances per listing
|
||||
Tags = BuildTags(parsed, d, role, city),
|
||||
});
|
||||
_db.TalentListings.Add(new TalentListing
|
||||
{
|
||||
Role = role, City = city, DistrictId = district?.Id,
|
||||
PersonName = !string.IsNullOrWhiteSpace(d?.PersonName) ? d!.PersonName!.Trim() : parsed.PersonName,
|
||||
YearsExperience = d?.YearsExperience ?? parsed.YearsExperience,
|
||||
IsLicensed = d?.IsLicensed ?? parsed.IsLicensed,
|
||||
AreaNote = parsed.AreaNote,
|
||||
Availability = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||
Gender = parsed.Gender,
|
||||
PayType = tShare is not null && tPay is null ? PayType.Percentage
|
||||
: tPay is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = tPay, SharePercent = tShare,
|
||||
Phone = !string.IsNullOrWhiteSpace(d?.Phone) ? d!.Phone!.Trim() : parsed.Phone,
|
||||
Description = raw.RawText,
|
||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||
Contacts = BuildContacts(d, parsed),
|
||||
Tags = BuildTags(parsed, d, role, city, extraRoleTags),
|
||||
});
|
||||
raw.Status = RawListingStatus.Normalized;
|
||||
return;
|
||||
}
|
||||
@@ -325,13 +406,34 @@ public class IngestionService
|
||||
}
|
||||
|
||||
/// <summary>Space-separated searchable tags: parsed cert/skill tags + AI-detected skills/requirements
|
||||
/// + this listing's role/category + city. Drives deep search and tag chips on the applicant card.</summary>
|
||||
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city)
|
||||
/// + secondary role names + this listing's role/category + city. Pay/contact/location noise and
|
||||
/// sentence fragments are filtered out so chips stay clinical. Drives deep search + tag chips.</summary>
|
||||
private static string BuildTags(ParsedListing parsed, AiStructured? d, Role role, City city,
|
||||
IEnumerable<string>? extraRoles = null)
|
||||
{
|
||||
var tags = new List<string>(parsed.Tags) { role.Name, role.Category, city.Name };
|
||||
if (extraRoles is not null) tags.AddRange(extraRoles);
|
||||
if (d?.Tags is not null)
|
||||
tags.AddRange(d.Tags.Where(t => !string.IsNullOrWhiteSpace(t)).Select(t => t.Trim()));
|
||||
return string.Join(" ", tags.Where(t => !string.IsNullOrWhiteSpace(t)).Distinct());
|
||||
return string.Join(" ", tags
|
||||
.Where(t => !string.IsNullOrWhiteSpace(t) && !IsNoiseTag(t))
|
||||
.Select(t => t.Trim())
|
||||
.Distinct());
|
||||
}
|
||||
|
||||
// Words/phrases that are NOT clinical skills — pay, contact, generic verbs, sentence fragments —
// that were polluting the tag chips (e.g. «پرداخت توافقی», «مراقبت از»).
private static readonly string[] TagStopWords =
{
    "توافقی", "پرداخت", "پرداخت توافقی", "حقوق", "دستمزد", "تماس", "شماره", "شماره تماس",
    "مراقبت از", "مراقبت", "همکاری", "آماده", "آماده به کار", "نیرو", "استخدام", "جذب",
};

/// <summary>
/// Decides whether a candidate tag is noise that should not be shown as a tag chip.
/// The tag is first passed through NormalizeFa (defined elsewhere in this class); it is noise when
/// the normalized text is shorter than 2 characters, ends with a dangling «از» preceded by a space
/// or hyphen, or equals one of the normalized <see cref="TagStopWords"/>.
/// </summary>
/// <param name="tag">The candidate tag; callers in this file pass non-blank strings.</param>
/// <returns><c>true</c> when the tag should be dropped; otherwise <c>false</c>.</returns>
private static bool IsNoiseTag(string tag)
{
    var t = NormalizeFa(tag);

    // Ordinal comparison: these are fixed tokens, not linguistic text, so culture-sensitive
    // matching (the default for EndsWith(string)) must not influence the result.
    if (t.Length < 2
        || t.EndsWith(" از", StringComparison.Ordinal)
        || t.EndsWith("-از", StringComparison.Ordinal))
    {
        return true; // too short, or a dangling «… از» fragment
    }

    // Stop words are normalized the same way as the tag before the (ordinal) equality check.
    return TagStopWords.Any(w => NormalizeFa(w) == t);
}
|
||||
|
||||
/// <summary>Resolve a role name to an existing Role; if it's genuinely new, create it (dynamic
|
||||
@@ -360,7 +462,7 @@ public class IngestionService
|
||||
var created = new Role
|
||||
{
|
||||
Name = Clamp(name.Trim(), 100), // respect Role.Name MaxLength(100)
|
||||
Category = Clamp(ResolveCategory(roles, category), 50), // respect Role.Category MaxLength(50)
|
||||
Category = Clamp(ResolveCategory(category), 50), // closed set → respect MaxLength(50)
|
||||
IsActive = true,
|
||||
SortOrder = (roles.Count == 0 ? 0 : roles.Max(r => r.SortOrder)) + 1,
|
||||
};
|
||||
@@ -371,19 +473,12 @@ public class IngestionService
|
||||
return created;
|
||||
}
|
||||
|
||||
/// <summary>Map an AI-suggested category to a canonical one: synonym alias first
|
||||
/// (پزشکی→پزشک، nursing→پرستار…), then any existing category that normalizes the same, else as-is.</summary>
|
||||
private static string ResolveCategory(List<Role> roles, string? category)
|
||||
{
|
||||
var raw = string.IsNullOrWhiteSpace(category) ? "سایر" : category!.Trim();
|
||||
// Resolve to a canonical first (synonym alias), then to whichever normalized form is the
|
||||
// matching target. Crucially, ALWAYS prefer a category string already stored on a role — even
|
||||
// after an alias maps to a canonical — so we never fork a second variant of the same group.
|
||||
var target = CategoryAliases.TryGetValue(NormalizeFa(raw), out var canonical) ? canonical : raw;
|
||||
var targetNorm = NormalizeFa(target);
|
||||
return roles.Select(r => r.Category)
|
||||
.FirstOrDefault(c => !string.IsNullOrWhiteSpace(c) && NormalizeFa(c) == targetNorm) ?? target;
|
||||
}
|
||||
/// <summary>Map an AI-suggested category to one of the FIXED groups (پزشک/پرستار/ماما/تکنسین/
/// دندانپزشک). Categories are a closed taxonomy — they drive the filter chips — so unlike roles
/// they are NEVER invented: a synonym resolves to its canonical group, anything else → «سایر».
/// (Per the original author's note, CategoryAliases maps each canonical group to itself, so exact
/// matches resolve here too.)</summary>
/// <param name="category">Category text suggested by the AI; may be null, empty or whitespace.</param>
/// <returns>The canonical category found in CategoryAliases, or «سایر» when the input is blank
/// or has no alias entry.</returns>
private static string ResolveCategory(string? category)
{
    const string Fallback = "سایر";

    // Blank input can never match an alias; short-circuit so a null is never handed to
    // NormalizeFa or used as a dictionary key.
    if (string.IsNullOrWhiteSpace(category))
    {
        return Fallback;
    }

    return CategoryAliases.TryGetValue(NormalizeFa(category), out var canonical) ? canonical : Fallback;
}
|
||||
|
||||
// Synonyms/abbreviations → canonical ROLE name, so the AI naming a role differently maps onto an
|
||||
// existing role instead of forking the taxonomy. Keys are matched after NormalizeFa. Add freely.
|
||||
|
||||
Reference in New Issue
Block a user