Auto-clean the board after every crawl (no manual cleanup clicks)
CI/CD / CI · dotnet build (push) Successful in 2m34s
CI/CD / Deploy · hamkadr (push) Successful in 2m4s

RunAsync now calls a new RunPostIngestCleanupAsync at the end of each crawl: archive
out-of-scope/duplicate listings, merge duplicate + fold junk facilities, and backfill missing
Tehran coords. Every step runs in place, is reversible for listings (archive, not delete), is
guarded for facilities, and is pure DB+CPU (no AI/network), so it is cheap to run on every ingest. The cleanup counts are appended to the
run-log detail. This keeps legacy + freshly-arrived junk from accumulating without the admin
having to click the cleanup buttons after each run.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 13:19:11 +03:30
parent bb8c6c3be5
commit b48e7dbc65
@@ -149,11 +149,16 @@ public class IngestionService
await DedupeTalentAsync(ct); // collapse same-ad reposts the exact-hash dedup can't catch
// Self-clean after every crawl so the board stays tidy with no manual admin clicks: archive
// out-of-scope/duplicate listings, merge duplicate + fold junk facilities, backfill coords.
var cleanup = results.Count > 0 ? await RunPostIngestCleanupAsync(ct) : default;
// Persist a run-log row so admins get a crawl history (with a per-source breakdown).
if (results.Count > 0)
{
var detail = string.Join("؛ ", results.Select(r =>
$"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"));
$"{r.Source}: یافت {r.Fetched}، صف {r.Queued}، منتشر {r.Published}، پرچم {r.Flagged}، اسپم {r.Spam}، تکراری {r.Duplicates}"))
+ $" || پاک‌سازیِ خودکار: {cleanup.archived} بایگانی، {cleanup.dedupedJobs} استخدامِ تکراری، {cleanup.mergedFac} مرکزِ ادغام، {cleanup.cleanedFac} مرکزِ حذف، {cleanup.coords} مختصات";
_db.IngestionRuns.Add(new IngestionRun
{
Fetched = summary.TotalFetched,
@@ -336,6 +341,24 @@ public class IngestionService
return filled;
}
/// <summary>
/// Self-cleaning pass that runs automatically at the end of every crawl and can also be invoked
/// on demand. It chains three existing maintenance steps, one after another:
/// <list type="number">
/// <item><description>archive out-of-scope/duplicate listings (archived, not deleted, so reversible);</description></item>
/// <item><description>merge duplicate facilities and fold junk ones (guarded: employer/verified facilities are never touched);</description></item>
/// <item><description>backfill missing Tehran map coordinates.</description></item>
/// </list>
/// Everything happens in place and is pure DB + CPU work (no AI, no network), which keeps it cheap
/// enough to run on every ingest, so the admin does not have to click the cleanup buttons by hand.
/// </summary>
/// <param name="ct">Cancellation token forwarded to each of the three cleanup steps.</param>
/// <returns>
/// The counters reported by the three steps, in the order
/// (archived, dedupedJobs, mergedFac, cleanedFac, coords).
/// </returns>
public async Task<(int archived, int dedupedJobs, int mergedFac, int cleanedFac, int coords)>
    RunPostIngestCleanupAsync(CancellationToken ct = default)
{
    // Step 1: listings. Yields the archived count and the duplicate-job count.
    var (archivedCount, duplicateJobCount) = await PurgeInvalidAggregatedAsync(ct);

    // Step 2: facilities. Yields the merged count and the cleaned count.
    var (mergedCount, cleanedCount) = await MergeAndCleanFacilitiesAsync(ct);

    // Step 3: coordinates. Yields how many were backfilled.
    var coordCount = await BackfillCoordsAsync(ct);

    // One structured log line summarising the whole pass.
    _log.LogInformation(
        "Post-ingest cleanup: archived={A} dedupedJobs={DJ} mergedFac={MF} cleanedFac={CF} coords={C}",
        archivedCount,
        duplicateJobCount,
        mergedCount,
        cleanedCount,
        coordCount);

    return (archivedCount, duplicateJobCount, mergedCount, cleanedCount, coordCount);
}
/// <summary>
/// SEO-safe in-place cleanup of the existing AGGREGATED job/shift board: re-screen each Open
/// listing's stored text through the CURRENT validator and ARCHIVE (Status → Archived, not delete)