Ingestion data-quality + map fixes: AI salary, geocode coverage, in-place backfill & purge
CI/CD / CI · dotnet build (push) Successful in 30s
CI/CD / Deploy · hamkadr (push) Successful in 1m11s

- Jobs now prefer the AI-extracted salary and fall back to the parser's figure
  (d?.PayAmount ?? parsed.PayAmount); they previously used only the parser figure, so
  every aggregated opening showed «توافقی» even when the ad stated a number.
- Geocoder also scans the ad body, so Tehran ads that name a neighbourhood only in
  free text («… در سهروردی») get an approximate map point.
- New BackfillCoordsAsync (+ admin button): fills missing coords on existing aggregated
  listings from their stored text, in place — no ID/URL churn, SEO-safe.
- New PurgeInvalidAggregatedAsync + DedupeJobsAsync (+ admin button): in-place removal of
  out-of-scope (domestic/promo/spam) aggregated jobs/shifts and duplicate job reposts,
  keeping valid listings' IDs.
- Jobs detail page always renders the location card (matches Shifts) instead of hiding it
  when coords are missing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 05:09:39 +03:30
parent a16a805869
commit e2011d335e
4 changed files with 173 additions and 10 deletions
@@ -58,6 +58,24 @@
توصیه‌شده برای پاک‌سازیِ آماده‌به‌کارها: متنِ خام نگه داشته می‌شود و فقط با منطقِ جدید (یک‌نفر=یک‌آگهی، نقش پایه، گروه ثابت، تگ تمیز، موقعیت تقریبی) بازساخته می‌شوند. صفحاتِ «آماده به کار» ایندکس نمی‌شوند، پس آدرسِ ایندکس‌شده‌ای تغییر نمی‌کند؛ شیفت/استخدام به‌مرور با ایمیجستِ تازه پاک می‌شوند.
</p>
<form method="post" onsubmit="return confirm('برای آگهی‌های جمع‌آوری‌شدهٔ تهران که موقعیت روی نقشه ندارند، از روی متنِ آگهی محلهٔ تقریبی پیدا و مختصات تنظیم می‌شود. شناسه و آدرس صفحات تغییر نمی‌کند (امن برای SEO). ادامه؟');">
<button type="submit" asp-page-handler="BackfillCoords" class="btn btn-primary btn-block" style="margin-top:10px;">
📍 تکمیل موقعیتِ نقشه برای آگهی‌های موجود
</button>
</form>
<p class="muted" style="font-size:11px; margin:6px 0 0;">
شیفت/استخدام/آماده‌به‌کارِ جمع‌آوری‌شده‌ای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار می‌گیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر می‌شود؛ موقعیتِ واقعیِ مراکز دست‌نخورده می‌ماند.
</p>
<form method="post" onsubmit="return confirm('آگهی‌های جمع‌آوری‌شدهٔ شیفت/استخدام که اکنون خارج از حوزه‌اند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدام‌های تکراری حذف می‌شوند. آگهی‌های معتبر و شناسه/آدرسشان دست‌نخورده می‌ماند. این کار بازگشت‌ناپذیر است. ادامه؟');">
<button type="submit" asp-page-handler="PurgeInvalid" class="btn btn-outline btn-block" style="margin-top:10px; color:var(--danger); border-color:var(--danger);">
🧽 حذفِ درجای آگهی‌های خارج از حوزه و تکراری (شیفت/استخدام)
</button>
</form>
<p class="muted" style="font-size:11px; margin:6px 0 0;">
فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری پاک می‌شوند. آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ فقط صفحاتِ بد ۴۰۴ می‌شوند.
</p>
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
<h3>افزودن دستی</h3>
@@ -120,6 +120,30 @@ public class IndexModel : PageModel
return RedirectToPage();
}
/// <summary>
/// Admin POST handler behind the "backfill map coordinates" button. Delegates to the ingestion
/// service's <c>BackfillCoordsAsync</c>, which fills missing coordinates on existing aggregated
/// listings in place (no delete/recreate, so listing IDs and URLs are unchanged), then stores a
/// summary containing the returned count in <c>IngestMessage</c>. The work runs inline, within
/// the request.
/// </summary>
/// <returns>A redirect back to this page.</returns>
public async Task<IActionResult> OnPostBackfillCoordsAsync()
{
    // Number of listings that were newly given a map position.
    var placedCount = await _ingest.BackfillCoordsAsync();

    IngestMessage = $"مختصات تقریبی برای {placedCount} آگهی جمع‌آوری‌شده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه).";

    return RedirectToPage();
}
/// <summary>
/// Admin POST handler behind the "purge invalid" button. Delegates to the ingestion service's
/// <c>PurgeInvalidAggregatedAsync</c>, which deletes out-of-scope aggregated jobs/shifts and
/// near-duplicate job reposts in place, then stores a summary with both counts in
/// <c>IngestMessage</c>. Listings that are not deleted keep their IDs/URLs. The work runs inline,
/// within the request.
/// </summary>
/// <returns>A redirect back to this page.</returns>
public async Task<IActionResult> OnPostPurgeInvalidAsync()
{
    // outcome.removed = out-of-scope listings deleted; outcome.deduped = duplicate jobs deleted.
    var outcome = await _ingest.PurgeInvalidAggregatedAsync();

    IngestMessage = $"پاک‌سازیِ درجا: {outcome.removed} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {outcome.deduped} استخدامِ تکراری حذف شد. سایر آگهی‌ها و شناسه/آدرسشان دست‌نخورده ماند.";

    return RedirectToPage();
}
private async Task LoadAsync()
{
Queue = await _db.RawListings
+12 -8
View File
@@ -161,12 +161,12 @@
}
</div>
@if (mapLat is not null && mapLng is not null)
{
var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
<div class="card card-pad" style="margin-top:16px;">
<h3 style="margin-top:0;">موقعیت مکانی</h3>
<div class="card card-pad" style="margin-top:16px;">
<h3 style="margin-top:0;">موقعیت مکانی</h3>
@if (mapLat is not null && mapLng is not null)
{
var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
@if (!string.IsNullOrEmpty(Model.MapKey))
{
<div id="facmap" data-lat="@latS" data-lng="@lngS" data-approx="@(mapApprox ? "true" : "false")" style="height:200px; border-radius:10px; overflow:hidden; border:1px solid var(--line);"></div>
@@ -183,8 +183,12 @@
}
<a class="btn btn-outline btn-block" style="margin-top:8px;" target="_blank" rel="noopener"
href="https://neshan.org/maps/@(latS),@(lngS),16z">مسیریابی در نشان</a>
</div>
}
}
else
{
<p class="muted" style="margin:0;">مختصات این آگهی ثبت نشده است.</p>
}
</div>
</aside>
</div>
</div>
@@ -299,6 +299,117 @@ public class IngestionService
return removed;
}
/// <summary>
/// In-place geocoding backfill for existing aggregated listings in Tehran whose <c>Lat</c> is
/// still null. For each such job opening, shift and talent listing, the stored ad text is passed
/// to <c>TehranGeo.Locate</c>; when it yields a point, that point is written to Lat/Lng.
/// Unlike <see cref="ReprocessAsync"/>, rows are only updated — never deleted or recreated — so
/// listing IDs (and therefore their URLs) are untouched. Rows that already have a latitude are
/// excluded by the queries, so an existing point is never overwritten.
/// </summary>
/// <param name="ct">Cancellation token forwarded to every database call.</param>
/// <returns>
/// The number of listings that received coordinates; 0 when the Tehran city row does not exist.
/// </returns>
public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
{
    var tehranCity = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
    if (tehranCity is null)
    {
        return 0;
    }

    var placed = 0;

    // Job openings: city is reached through the owning facility; only the description is scanned.
    var pendingJobs = await _db.JobOpenings
        .Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var job in pendingJobs)
    {
        if (TehranGeo.Locate(job.Description) is { } point)
        {
            job.Lat = point.lat;
            job.Lng = point.lng;
            placed++;
        }
    }

    // Shifts: same rule as job openings.
    var pendingShifts = await _db.Shifts
        .Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var shift in pendingShifts)
    {
        if (TehranGeo.Locate(shift.Description) is { } point)
        {
            shift.Lat = point.lat;
            shift.Lng = point.lng;
            placed++;
        }
    }

    // Talent listings carry their own CityId; the area note is passed ahead of the description.
    var pendingTalent = await _db.TalentListings
        .Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var listing in pendingTalent)
    {
        if (TehranGeo.Locate(listing.AreaNote, listing.Description) is { } point)
        {
            listing.Lat = point.lat;
            listing.Lng = point.lng;
            placed++;
        }
    }

    // Skip the round-trip entirely when nothing was changed.
    if (placed > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", placed);
    return placed;
}
/// <summary>
/// In-place cleanup of the existing aggregated job/shift board. Each aggregated listing's stored
/// description is re-parsed and re-validated with the current parser and validator, and only
/// listings whose result reports <see cref="ValidationResult.IsSpam"/> are deleted (per the
/// original author's note, that flag covers spam, promotional and domestic-helper ads). Listings
/// the validator does not flag are left alone, so their IDs stay stable. Afterwards
/// <see cref="DedupeJobsAsync"/> collapses near-duplicate job reposts.
/// </summary>
/// <param name="ct">Cancellation token forwarded to every database call.</param>
/// <returns>
/// <c>removed</c>: flagged jobs plus flagged shifts deleted; <c>deduped</c>: duplicate jobs deleted
/// by <see cref="DedupeJobsAsync"/>.
/// </returns>
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
    // Reference vocabularies the parser needs; loaded once and shared by every re-screen.
    var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // True when the current validator flags the text as spam | promo | domestic-helper.
    bool IsOutOfScope(string? text)
    {
        var body = text ?? "";
        var parsedAd = _parser.Parse(body, roleNames, cityNames, districtNames);
        return _validator.Validate(body, parsedAd).IsSpam;
    }

    var removed = 0;

    // Jobs: pull only Id + text, screen in memory, then bulk-delete the flagged ids.
    var jobTexts = await _db.JobOpenings
        .Where(j => j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.Description })
        .ToListAsync(ct);
    var jobIds = jobTexts
        .Where(j => IsOutOfScope(j.Description))
        .Select(j => j.Id)
        .ToList();
    if (jobIds.Count > 0)
    {
        var deletedJobs = await _db.JobOpenings.Where(j => jobIds.Contains(j.Id)).ExecuteDeleteAsync(ct);
        removed += deletedJobs;
    }

    // Shifts: identical procedure.
    var shiftTexts = await _db.Shifts
        .Where(s => s.Source == ShiftSource.Aggregated)
        .Select(s => new { s.Id, s.Description })
        .ToListAsync(ct);
    var shiftIds = shiftTexts
        .Where(s => IsOutOfScope(s.Description))
        .Select(s => s.Id)
        .ToList();
    if (shiftIds.Count > 0)
    {
        var deletedShifts = await _db.Shifts.Where(s => shiftIds.Contains(s.Id)).ExecuteDeleteAsync(ct);
        removed += deletedShifts;
    }

    // Dedupe runs last, so it only considers jobs that survived the purge above.
    var deduped = await DedupeJobsAsync(ct);

    _log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removed, deduped);
    return (removed, deduped);
}
/// <summary>
/// Collapse near-duplicate aggregated job reposts among open job openings. Per the original
/// author's note these are the same ad re-crawled with slightly different surrounding text, which
/// an exact content-hash comparison does not catch.
/// <para>
/// Signature = role id + facility id + the first 120 characters of the description after digits
/// and relative-time phrases are replaced by spaces and the result is passed through
/// <c>NormalizeFa</c> and trimmed. Descriptions whose normalized core is shorter than 15
/// characters get no signature and are never treated as duplicates.
/// </para>
/// <para>
/// Within each signature group the newest row (by <c>CreatedAt</c>) is kept and the rest are
/// deleted. Ties on <c>CreatedAt</c> are broken by the higher <c>Id</c>, so the surviving row is
/// deterministic and does not depend on the order the database returned the rows in. Because the
/// role id is part of the signature, one ad fanned out to several roles is preserved.
/// </para>
/// </summary>
/// <param name="ct">Cancellation token forwarded to every database call.</param>
/// <returns>The number of job openings deleted (0 when no duplicates were found).</returns>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
    // Project only what the signature and the newest-wins ordering need.
    var rows = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
        .ToListAsync(ct);

    // Builds the grouping key, or null when there is too little text to compare safely.
    string? Sig(int roleId, int facId, string? desc)
    {
        var core = NormalizeFa(Regex.Replace(desc ?? "",
            @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
        if (core.Length < 15) return null; // too little to call it a dup safely
        return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
    }

    var toRemove = rows
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        .SelectMany(g => g
            .OrderByDescending(x => x.CreatedAt)
            // Tie-break: rows were loaded without ORDER BY, so equal timestamps would otherwise
            // leave the survivor up to whatever order the database happened to return.
            .ThenByDescending(x => x.Id)
            .Skip(1)
            .Select(x => x.Id))
        .ToList();

    if (toRemove.Count == 0) return 0;

    var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
    return removed;
}
/// <summary>
/// Returns only the digit characters of <paramref name="s"/>, after the input has been passed
/// through <c>HtmlUtil.ToLatinDigits</c>. Characters are kept when <see cref="char.IsDigit(char)"/>
/// is true for them; everything else is dropped.
/// </summary>
/// <param name="s">The text to filter.</param>
/// <returns>A string made of the retained digit characters, in their original order.</returns>
private static string DigitsOnly(string s)
{
    var latin = HtmlUtil.ToLatinDigits(s);
    var digits = latin.Where(char.IsDigit).ToArray();
    return new string(digits);
}
private static (RawListingStatus status, string? reason, int confidence) Decide(
@@ -366,8 +477,11 @@ public class IngestionService
// Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough
// center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin.
double? appLat = raw.Lat, appLng = raw.Lng;
// Geocode from the structured location fields first, then fall back to scanning the ad body
// itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی»)
// and never populate a district/area field, which is why most aggregated listings had no map.
if (appLat is null && city.Name == "تهران"
&& TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote) is { } g)
&& TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g)
{ appLat = g.lat; appLng = g.lng; }
// Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran
// (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
@@ -446,7 +560,10 @@ public class IngestionService
Facility = facility, Role = role,
Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
SalaryMin = parsed.PayAmount,
// Prefer the AI-extracted salary, falling back to the parser's — matching the talent
// path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure,
// so every aggregated opening showed «توافقی» even when the ad stated a number.)
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
SourceUrl = raw.SourceUrl,
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center