Ingestion data-quality + map fixes: AI salary, geocode coverage, in-place backfill & purge
- Jobs now keep the AI-extracted salary (d.PayAmount ?? parsed.PayAmount); they previously used only the parser figure, so every aggregated opening showed «توافقی». - Geocoder also scans the ad body, so Tehran ads that name a neighbourhood only in free text («… در سهروردی») get an approximate map point. - New BackfillCoordsAsync (+ admin button): fills missing coords on existing aggregated listings from their stored text, in place — no ID/URL churn, SEO-safe. - New PurgeInvalidAggregatedAsync + DedupeJobsAsync (+ admin button): in-place removal of out-of-scope (domestic/promo/spam) aggregated jobs/shifts and duplicate job reposts, keeping valid listings' IDs. - Jobs detail page always renders the location card (matches Shifts) instead of hiding it when coords are missing. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -58,6 +58,24 @@
|
||||
توصیه‌شده برای پاکسازیِ آماده‌به‌کارها: متنِ خام نگه داشته می‌شود و فقط با منطقِ جدید (یک‌نفر=یک‌آگهی، نقش پایه، گروه ثابت، تگ تمیز، موقعیت تقریبی) بازساخته می‌شوند. صفحاتِ «آماده به کار» ایندکس نمی‌شوند، پس آدرسِ ایندکس‌شده‌ای تغییر نمی‌کند؛ شیفت/استخدام به‌مرور با ایمیجستِ تازه پاک می‌شوند.
|
||||
</p>
|
||||
|
||||
<form method="post" onsubmit="return confirm('برای آگهیهای جمعآوریشدهٔ تهران که موقعیت روی نقشه ندارند، از روی متنِ آگهی محلهٔ تقریبی پیدا و مختصات تنظیم میشود. شناسه و آدرس صفحات تغییر نمیکند (امن برای SEO). ادامه؟');">
|
||||
<button type="submit" asp-page-handler="BackfillCoords" class="btn btn-primary btn-block" style="margin-top:10px;">
|
||||
📍 تکمیل موقعیتِ نقشه برای آگهی‌های موجود
|
||||
</button>
|
||||
</form>
|
||||
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||
شیفت/استخدام/آماده‌به‌کارِ جمع‌آوری‌شده‌ای که مختصات ندارند، از روی محلهٔ ذکرشده در متنِ آگهی روی نقشه قرار می‌گیرند (محدودهٔ تقریبی). فقط مختصاتِ خالی پر می‌شود؛ موقعیتِ واقعیِ مراکز دست‌نخورده می‌ماند.
|
||||
</p>
|
||||
|
||||
<form method="post" onsubmit="return confirm('آگهیهای جمعآوریشدهٔ شیفت/استخدام که اکنون خارج از حوزهاند (خدمات منزل/نظافت، تبلیغاتی/آموزشی، اسپم) و استخدامهای تکراری حذف میشوند. آگهیهای معتبر و شناسه/آدرسشان دستنخورده میماند. این کار بازگشتناپذیر است. ادامه؟');">
|
||||
<button type="submit" asp-page-handler="PurgeInvalid" class="btn btn-outline btn-block" style="margin-top:10px; color:var(--danger); border-color:var(--danger);">
|
||||
🧽 حذفِ درجای آگهی‌های خارج از حوزه و تکراری (شیفت/استخدام)
|
||||
</button>
|
||||
</form>
|
||||
<p class="muted" style="font-size:11px; margin:6px 0 0;">
|
||||
فقط آگهی‌هایی که با صافیِ فعلی «خارج از حوزه» تشخیص داده می‌شوند (نه صرفاً ناقص) و استخدام‌های تکراری پاک می‌شوند. آگهی‌های معتبر دست‌نخورده‌اند، پس آدرسِ ایندکس‌شده‌شان تغییر نمی‌کند؛ فقط صفحاتِ بد ۴۰۴ می‌شوند.
|
||||
</p>
|
||||
|
||||
<hr style="border:none; border-top:1px solid var(--line); margin:16px 0;" />
|
||||
|
||||
<h3>افزودن دستی</h3>
|
||||
|
||||
@@ -120,6 +120,30 @@ public class IndexModel : PageModel
|
||||
return RedirectToPage();
|
||||
}
|
||||
|
||||
/// <summary>
/// Admin POST handler for the "BackfillCoords" button. Delegates to the ingestion service's
/// coordinate backfill, records how many listings it reports as filled in
/// <c>IngestMessage</c>, and redirects back to this page.
/// </summary>
/// <returns>A redirect to the current page.</returns>
public async Task<IActionResult> OnPostBackfillCoordsAsync()
{
    // The service returns the number of listings that received coordinates.
    var placedCount = await _ingest.BackfillCoordsAsync();

    IngestMessage = $"مختصات تقریبی برای {placedCount} آگهی جمعآوریشده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه).";

    return RedirectToPage();
}
|
||||
|
||||
/// <summary>
/// Admin POST handler for the "PurgeInvalid" button. Delegates to the ingestion service's
/// in-place purge, which reports two counts: listings removed as out-of-scope and job openings
/// removed as duplicates. Both counts are written to <c>IngestMessage</c> before redirecting
/// back to this page.
/// </summary>
/// <returns>A redirect to the current page.</returns>
public async Task<IActionResult> OnPostPurgeInvalidAsync()
{
    (int outOfScope, int duplicates) = await _ingest.PurgeInvalidAggregatedAsync();

    IngestMessage = $"پاکسازیِ درجا: {outOfScope} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {duplicates} استخدامِ تکراری حذف شد. سایر آگهیها و شناسه/آدرسشان دستنخورده ماند.";

    return RedirectToPage();
}
|
||||
|
||||
private async Task LoadAsync()
|
||||
{
|
||||
Queue = await _db.RawListings
|
||||
|
||||
@@ -161,12 +161,12 @@
|
||||
}
|
||||
</div>
|
||||
|
||||
@if (mapLat is not null && mapLng is not null)
|
||||
{
|
||||
var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
|
||||
var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
|
||||
<div class="card card-pad" style="margin-top:16px;">
|
||||
<h3 style="margin-top:0;">موقعیت مکانی</h3>
|
||||
<div class="card card-pad" style="margin-top:16px;">
|
||||
<h3 style="margin-top:0;">موقعیت مکانی</h3>
|
||||
@if (mapLat is not null && mapLng is not null)
|
||||
{
|
||||
var latS = mapLat.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
|
||||
var lngS = mapLng.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
|
||||
@if (!string.IsNullOrEmpty(Model.MapKey))
|
||||
{
|
||||
<div id="facmap" data-lat="@latS" data-lng="@lngS" data-approx="@(mapApprox ? "true" : "false")" style="height:200px; border-radius:10px; overflow:hidden; border:1px solid var(--line);"></div>
|
||||
@@ -183,8 +183,12 @@
|
||||
}
|
||||
<a class="btn btn-outline btn-block" style="margin-top:8px;" target="_blank" rel="noopener"
|
||||
href="https://neshan.org/maps/@(latS),@(lngS),16z">مسیریابی در نشان</a>
|
||||
</div>
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
<p class="muted" style="margin:0;">مختصات این آگهی ثبت نشده است.</p>
|
||||
}
|
||||
</div>
|
||||
</aside>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -299,6 +299,117 @@ public class IngestionService
|
||||
return removed;
|
||||
}
|
||||
|
||||
/// <summary>
/// Fills in missing map coordinates for aggregated listings located in the city named «تهران».
/// For each job opening, shift and talent listing whose <c>Lat</c> is null, the stored text is
/// passed to <c>TehranGeo.Locate</c>; when that yields a point, <c>Lat</c>/<c>Lng</c> are set on
/// the existing row. Rows are only updated — nothing is deleted or re-created here — and rows
/// that already have a latitude are never selected, so existing points are left alone.
/// </summary>
/// <param name="ct">Cancellation token flowed to every database call.</param>
/// <returns>
/// The number of listings that received coordinates; 0 when the Tehran city row does not exist.
/// </returns>
public async Task<int> BackfillCoordsAsync(CancellationToken ct = default)
{
    var tehranCity = await _db.Cities.FirstOrDefaultAsync(c => c.Name == "تهران", ct);
    if (tehranCity is null)
    {
        return 0;
    }

    var placed = 0;

    // Job openings: geocode from the ad description only.
    var pendingJobs = await _db.JobOpenings
        .Where(j => j.Lat == null && j.Source == ShiftSource.Aggregated && j.Facility.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var job in pendingJobs)
    {
        if (TehranGeo.Locate(job.Description) is not { } hit)
        {
            continue;
        }

        job.Lat = hit.lat;
        job.Lng = hit.lng;
        placed++;
    }

    // Shifts: same rule as jobs.
    var pendingShifts = await _db.Shifts
        .Where(s => s.Lat == null && s.Source == ShiftSource.Aggregated && s.Facility.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var shift in pendingShifts)
    {
        if (TehranGeo.Locate(shift.Description) is not { } hit)
        {
            continue;
        }

        shift.Lat = hit.lat;
        shift.Lng = hit.lng;
        placed++;
    }

    // Talent listings carry their city directly and offer the area note before the description.
    var pendingTalent = await _db.TalentListings
        .Where(t => t.Lat == null && t.Source == ShiftSource.Aggregated && t.CityId == tehranCity.Id)
        .ToListAsync(ct);
    foreach (var listing in pendingTalent)
    {
        if (TehranGeo.Locate(listing.AreaNote, listing.Description) is not { } hit)
        {
            continue;
        }

        listing.Lat = hit.lat;
        listing.Lng = hit.lng;
        placed++;
    }

    // One save for all three sets; skipped entirely when nothing changed.
    if (placed > 0)
    {
        await _db.SaveChangesAsync(ct);
    }

    _log.LogInformation("Coordinate backfill placed {N} aggregated listings on the map.", placed);
    return placed;
}
|
||||
|
||||
/// <summary>
/// Cleans the aggregated job/shift board in place. Each aggregated job opening and shift has its
/// stored description re-parsed and re-validated; rows whose validation result has
/// <see cref="ValidationResult.IsSpam"/> set are deleted (the original author notes this flag
/// covers spam, promotional and domestic-helper ads). Afterwards <see cref="DedupeJobsAsync"/>
/// removes near-duplicate job reposts. Rows that pass validation are not modified.
/// </summary>
/// <param name="ct">Cancellation token flowed to every database call.</param>
/// <returns>
/// <c>removed</c>: jobs plus shifts deleted as out-of-scope; <c>deduped</c>: jobs deleted as duplicates.
/// </returns>
public async Task<(int removed, int deduped)> PurgeInvalidAggregatedAsync(CancellationToken ct = default)
{
    // Vocabulary the parser needs; loaded once and shared by every screening call.
    var roleNames = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
    var cityNames = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
    var districtNames = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

    // A missing description is screened as the empty string.
    bool IsOutOfScope(string? text)
    {
        var body = text ?? "";
        var parsedAd = _parser.Parse(body, roleNames, cityNames, districtNames);
        var verdict = _validator.Validate(body, parsedAd);
        return verdict.IsSpam; // spam | promo | domestic-helper
    }

    var removedTotal = 0;

    // Jobs: pull only Id + text, screen in memory, then bulk-delete the rejects.
    var aggregatedJobs = await _db.JobOpenings
        .Where(j => j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.Description })
        .ToListAsync(ct);
    var rejectedJobIds = aggregatedJobs
        .Where(j => IsOutOfScope(j.Description))
        .Select(j => j.Id)
        .ToList();
    if (rejectedJobIds.Count > 0)
    {
        removedTotal += await _db.JobOpenings
            .Where(j => rejectedJobIds.Contains(j.Id))
            .ExecuteDeleteAsync(ct);
    }

    // Shifts: same procedure.
    var aggregatedShifts = await _db.Shifts
        .Where(s => s.Source == ShiftSource.Aggregated)
        .Select(s => new { s.Id, s.Description })
        .ToListAsync(ct);
    var rejectedShiftIds = aggregatedShifts
        .Where(s => IsOutOfScope(s.Description))
        .Select(s => s.Id)
        .ToList();
    if (rejectedShiftIds.Count > 0)
    {
        removedTotal += await _db.Shifts
            .Where(s => rejectedShiftIds.Contains(s.Id))
            .ExecuteDeleteAsync(ct);
    }

    // Duplicates are collapsed only after the out-of-scope rows are gone.
    var dedupedCount = await DedupeJobsAsync(ct);

    _log.LogInformation("Purge removed {R} out-of-scope aggregated listings; deduped {D} jobs.", removedTotal, dedupedCount);
    return (removedTotal, dedupedCount);
}
|
||||
|
||||
/// <summary>
/// Collapses near-duplicate aggregated job openings that differ only in incidental text.
/// Open aggregated jobs are grouped by a signature built from role id, facility id and the
/// first 120 characters of the description after digits and relative-time phrases
/// («… پیش», «دیروز», «پریروز») are stripped and the text is passed through <c>NormalizeFa</c>.
/// Within each group the newest row (by <c>CreatedAt</c>) is kept and the rest are deleted.
/// Rows with different role ids never share a signature, so they are never merged.
/// </summary>
/// <remarks>
/// Ties on <c>CreatedAt</c> are broken by <c>Id</c> (highest kept). The rows are loaded without an
/// ORDER BY, so without this tie-break the surviving row of a same-timestamp group would depend
/// on whatever order the database happened to return.
/// </remarks>
/// <param name="ct">Cancellation token flowed to the database calls.</param>
/// <returns>The number of job openings deleted; 0 when no duplicates were found.</returns>
public async Task<int> DedupeJobsAsync(CancellationToken ct = default)
{
    var rows = await _db.JobOpenings
        .Where(j => j.Status == ShiftStatus.Open && j.Source == ShiftSource.Aggregated)
        .Select(j => new { j.Id, j.RoleId, j.FacilityId, j.Description, j.CreatedAt })
        .ToListAsync(ct);

    // Builds the grouping key, or null when too little text remains to compare safely.
    string? Sig(int roleId, int facId, string? desc)
    {
        var core = NormalizeFa(Regex.Replace(desc ?? "",
            @"[0-9۰-۹]+|روز پیش|ساعت پیش|هفته پیش|دقیقه پیش|دیروز|پریروز", " ")).Trim();
        if (core.Length < 15)
        {
            return null; // too little to call it a dup safely
        }

        return $"j:{roleId}:{facId}:{(core.Length > 120 ? core[..120] : core)}";
    }

    var toRemove = rows
        .Select(r => new { r.Id, r.CreatedAt, Key = Sig(r.RoleId, r.FacilityId, r.Description) })
        .Where(x => x.Key is not null)
        .GroupBy(x => x.Key)
        .SelectMany(g => g
            .OrderByDescending(x => x.CreatedAt)
            .ThenByDescending(x => x.Id) // deterministic survivor when timestamps are equal
            .Skip(1)
            .Select(x => x.Id))
        .ToList();

    if (toRemove.Count == 0)
    {
        return 0;
    }

    var removed = await _db.JobOpenings.Where(j => toRemove.Contains(j.Id)).ExecuteDeleteAsync(ct);
    _log.LogInformation("Deduped {N} near-duplicate aggregated jobs.", removed);
    return removed;
}
|
||||
|
||||
/// <summary>
/// Returns only the digit characters of <paramref name="s"/>, after the input has been passed
/// through <c>HtmlUtil.ToLatinDigits</c>. A character is kept when <see cref="char.IsDigit(char)"/>
/// is true for it; everything else is dropped.
/// </summary>
private static string DigitsOnly(string s)
{
    var latinised = HtmlUtil.ToLatinDigits(s);
    return string.Concat(latinised.Where(char.IsDigit));
}
|
||||
|
||||
private static (RawListingStatus status, string? reason, int confidence) Decide(
|
||||
@@ -366,8 +477,11 @@ public class IngestionService
|
||||
// Tehran ad that only NAMES a neighborhood (Medjobs/Telegram), geocode that name to a rough
|
||||
// center. Shown as a «محدودهٔ تقریبی» circle, never a precise pin.
|
||||
double? appLat = raw.Lat, appLng = raw.Lng;
|
||||
// Geocode from the structured location fields first, then fall back to scanning the ad body
|
||||
// itself — many Tehran ads name the neighbourhood only in free text («… نیم ساعت پیش در سهروردی»)
|
||||
// and never populate a district/area field, which is why most aggregated listings had no map.
|
||||
if (appLat is null && city.Name == "تهران"
|
||||
&& TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote) is { } g)
|
||||
&& TehranGeo.Locate(district?.Name, districtName, parsed.AreaNote, raw.RawText) is { } g)
|
||||
{ appLat = g.lat; appLng = g.lng; }
|
||||
// Last resort — the AI model's inferred coords, but ONLY when they fall inside greater Tehran
|
||||
// (rejects a hallucinated point elsewhere). Uses the registered model where the rules can't decide.
|
||||
@@ -446,7 +560,10 @@ public class IngestionService
|
||||
Facility = facility, Role = role,
|
||||
Title = !string.IsNullOrWhiteSpace(d?.Title) && pubRoles.Count == 1 ? d!.Title!.Trim() : $"استخدام {role.Name}",
|
||||
EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
|
||||
SalaryMin = parsed.PayAmount,
|
||||
// Prefer the AI-extracted salary, falling back to the parser's — matching the talent
|
||||
// path. (Jobs previously used only parsed.PayAmount, silently dropping the AI figure,
|
||||
// so every aggregated opening showed «توافقی» even when the ad stated a number.)
|
||||
SalaryMin = d?.PayAmount ?? parsed.PayAmount,
|
||||
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
|
||||
SourceUrl = raw.SourceUrl,
|
||||
Lat = appLat, Lng = appLng, // source point (Divar) or geocoded neighborhood center
|
||||
|
||||
Reference in New Issue
Block a user