b3e7123d74
Parser: most jobs read «توافقی» because the amount extractor only saw 6–10 digit numbers, missing the way Iranian ads actually state pay — «۱۵ تومان»، «۴۰ تا ۵۰ تومان»، «۲۰ میلیون»، «۲۰م» all mean MILLIONS of toman. Add colloquial detection (1–3 digit number + تومان/م/میلیون → ×1,000,000, lower bound of a range), guarded so it never matches dates/hours or a long literal-toman figure. Also: a stated amount now wins over «توافقی» (ads often say a number AND «… بقیه توافقی»). Backfill: BackfillPayAsync re-parses existing aggregated jobs/talent that have no salary and fills it in place (no AI, no ID/URL change) — wired into the post-ingest auto-cleanup and exposed as an admin button. Existing «توافقی» listings with a stated number get their salary; genuinely-negotiable ads stay توافقی. Also improves the baseSalary in JobPosting rich results. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
215 lines
12 KiB
C#
215 lines
12 KiB
C#
using JobsMedical.Web.Data;
|
|
using JobsMedical.Web.Models;
|
|
using JobsMedical.Web.Services.Scraping;
|
|
using Microsoft.AspNetCore.Authorization;
|
|
using Microsoft.AspNetCore.Mvc;
|
|
using Microsoft.AspNetCore.Mvc.RazorPages;
|
|
using Microsoft.EntityFrameworkCore;
|
|
|
|
namespace JobsMedical.Web.Pages.Admin;
|
|
|
|
[Authorize(Roles = "Admin")]
|
|
public class IndexModel : PageModel
|
|
{
|
|
/// <summary>EF Core context used for this page's direct queries, inserts and deletes.</summary>
private readonly AppDbContext _db;

/// <summary>Ingestion pipeline that the admin actions on this page delegate to.</summary>
private readonly IngestionService _ingest;

/// <summary>Factory for DI scopes, used by work that is started from this page.</summary>
private readonly IServiceScopeFactory _scopes;

/// <summary>Logger for this page model.</summary>
private readonly ILogger<IndexModel> _log;

/// <summary>Stores the injected dependencies; does no other work.</summary>
/// <param name="db">Application database context.</param>
/// <param name="ingest">Ingestion service.</param>
/// <param name="scopes">Service scope factory.</param>
/// <param name="log">Logger for this page.</param>
public IndexModel(AppDbContext db, IngestionService ingest, IServiceScopeFactory scopes, ILogger<IndexModel> log)
    => (_db, _ingest, _scopes, _log) = (db, ingest, scopes, log);
|
|
|
|
/// <summary>Number of rows per page, shared by the queue and the flagged list.</summary>
public const int PageSize = 20;

/// <summary>Current page of raw listings whose status is New.</summary>
public List<RawListing> Queue { get; private set; } = new List<RawListing>();

/// <summary>Current page of raw listings whose status is Flagged.</summary>
public List<RawListing> Flagged { get; private set; } = new List<RawListing>();

/// <summary>1-based page number of <see cref="Queue"/>, clamped to the available pages.</summary>
public int QueuePage { get; private set; } = 1;

/// <summary>Total number of raw listings with status New.</summary>
public int QueueTotal { get; private set; }

/// <summary>1-based page number of <see cref="Flagged"/>, clamped to the available pages.</summary>
public int FlaggedPage { get; private set; } = 1;

/// <summary>Total number of raw listings with status Flagged.</summary>
public int FlaggedTotal { get; private set; }

/// <summary>Number of queue pages; never less than 1, even when the queue is empty.</summary>
public int QueuePages => PageCount(QueueTotal);

/// <summary>Number of flagged pages; never less than 1, even when nothing is flagged.</summary>
public int FlaggedPages => PageCount(FlaggedTotal);

/// <summary>Source names as reported by the ingestion service.</summary>
public IReadOnlyList<string> SourceNames { get; private set; } = new List<string>();

/// <summary>Count of shifts whose source is not Direct.</summary>
public int PublishedShifts { get; private set; }

/// <summary>Count of job openings.</summary>
public int PublishedJobs { get; private set; }

/// <summary>The most recent ingestion runs, newest first.</summary>
public List<IngestionRun> Runs { get; private set; } = new List<IngestionRun>();

/// <summary>Bound form field: source channel for a manually added raw listing.</summary>
[BindProperty]
public string? SourceChannel { get; set; }

/// <summary>Bound form field: the ad text for a manually added raw listing.</summary>
[BindProperty]
public string? RawText { get; set; }

/// <summary>Status message set by the POST handlers and carried across the redirect via TempData.</summary>
[TempData]
public string? IngestMessage { get; set; }

/// <summary>Ceiling of <paramref name="total"/> / <see cref="PageSize"/>, with a floor of one page.</summary>
private static int PageCount(int total)
{
    var pages = (int)Math.Ceiling(total / (double)PageSize);
    return Math.Max(1, pages);
}
|
|
|
|
/// <summary>GET handler: loads the dashboard data for the requested pages.</summary>
/// <param name="q">1-based page of the review queue.</param>
/// <param name="f">1-based page of the flagged list.</param>
public async Task OnGetAsync(int q = 1, int f = 1)
{
    await LoadAsync(q, f);
}
|
|
|
|
/// <summary>
/// POST handler for the manual-entry form: stores the bound <see cref="RawText"/> (trimmed) as a
/// raw listing with status New. Null, empty or whitespace-only text is ignored.
/// </summary>
/// <returns>A redirect back to this page in every case.</returns>
public async Task<IActionResult> OnPostAddAsync()
{
    // Nothing to stage when the text box was left blank.
    if (string.IsNullOrWhiteSpace(RawText))
    {
        return RedirectToPage();
    }

    // A blank channel falls back to the fixed manual-entry label.
    var channel = string.IsNullOrWhiteSpace(SourceChannel) ? "ورود دستی" : SourceChannel.Trim();

    var listing = new RawListing
    {
        SourceChannel = channel,
        RawText = RawText.Trim(),
        Status = RawListingStatus.New,
    };

    _db.RawListings.Add(listing);
    await _db.SaveChangesAsync();

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Fast triage: marks the raw listing with the given id as Discarded without opening the review
/// page. An unknown id is silently ignored.
/// </summary>
/// <param name="id">Primary key of the raw listing to discard.</param>
/// <returns>A redirect back to this page in every case.</returns>
public async Task<IActionResult> OnPostQuickDiscardAsync(int id)
{
    var listing = await _db.RawListings.FirstOrDefaultAsync(item => item.Id == id);
    if (listing is null)
    {
        return RedirectToPage();
    }

    listing.Status = RawListingStatus.Discarded;
    await _db.SaveChangesAsync();

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Runs the ingestion pipeline inline and reports the queued / flagged / spam / duplicate totals
/// through <see cref="IngestMessage"/>.
/// </summary>
/// <returns>A redirect back to this page.</returns>
public async Task<IActionResult> OnPostRunIngestionAsync()
{
    var stats = await _ingest.RunAsync();

    var queuedAndFlagged = $"جمعآوری انجام شد — {stats.TotalQueued} در صف، {stats.TotalFlagged} پرچمخورده، ";
    var spamAndDuplicates = $"{stats.TotalSpam} اسپم، {stats.TotalDuplicates} تکراری.";
    IngestMessage = queuedAndFlagged + spamAndDuplicates;

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// DESTRUCTIVE rebuild. Deletes, in one transaction:
///   1. the dedupe cache: ALL raw listings, manually added ones included. These are the
///      crawl/staging rows (not published content) whose ContentHash blocks re-ingesting the same
///      ad, so wiping them lets everything be fetched and judged again;
///   2. aggregated listings only: shifts, job openings and talent listings whose Source is
///      Aggregated. Listings with any other source (e.g. Direct) are not touched.
/// Afterwards the ingestion pipeline is run again and both sets of counts are reported through
/// <see cref="IngestMessage"/>.
/// </summary>
/// <returns>A redirect back to this page.</returns>
public async Task<IActionResult> OnPostPurgeAndReingestAsync()
{
    var removed = await DeleteCacheAndAggregatedAsync();

    // Fresh fetch -> AI audit -> publish/queue.
    var stats = await _ingest.RunAsync();

    var purgePart = $"پاکسازی شد (حذف: {removed.Raw} آیتم کش، {removed.Shifts} شیفت، {removed.Jobs} استخدام، {removed.Talent} آمادهبهکارِ جمعآوریشده). ";
    var ingestPart = $"جمعآوری مجدد: {stats.TotalPublished} منتشر، {stats.TotalQueued} در صف، {stats.TotalFlagged} پرچم، {stats.TotalSpam} اسپم، {stats.TotalDuplicates} تکراری.";
    IngestMessage = purgePart + ingestPart;

    return RedirectToPage();
}

/// <summary>
/// Transactionally deletes every raw listing plus all aggregated shifts, job openings and talent
/// listings, and returns how many rows of each kind were removed. Raw listings are deleted first
/// so their LinkedShift/LinkedTalent foreign keys (SetNull) don't dangle; the database cascade is
/// relied on to clear ContactMethods / Applications / InterestEvents when the posts are deleted.
/// </summary>
private async Task<(int Raw, int Shifts, int Jobs, int Talent)> DeleteCacheAndAggregatedAsync()
{
    // Disposed when this helper returns, i.e. right after the commit and before ingestion starts.
    await using var tx = await _db.Database.BeginTransactionAsync();

    var raw = await _db.RawListings.ExecuteDeleteAsync(); // clear dedupe cache
    var shifts = await _db.Shifts
        .Where(shift => shift.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync();
    var jobs = await _db.JobOpenings
        .Where(job => job.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync();
    var talent = await _db.TalentListings
        .Where(listing => listing.Source == ShiftSource.Aggregated)
        .ExecuteDeleteAsync();

    await tx.CommitAsync();
    return (raw, shifts, jobs, talent);
}
|
|
|
|
/// <summary>
/// Cleans up EXISTING aggregated content by re-running the current pipeline over the stored raw
/// text: no re-fetch, so nothing is lost to sources that only expose recent posts. The work is
/// long-running (one AI call per item), so it is started on a background task with its own DI
/// scope and this handler returns immediately; the outcome appears as a new row in the run-history
/// log once it finishes.
/// </summary>
/// <returns>A redirect back to this page, with <see cref="IngestMessage"/> announcing the start.</returns>
public IActionResult OnPostReprocessStored()
{
    // Copy what the background work needs into locals so the lambda does not capture (and keep
    // alive) this request-scoped page model. ILogger<T> is registered as a singleton by the
    // default logging setup, so it stays usable after the request has ended.
    var scopes = _scopes;
    var log = _log;

    _ = Task.Run(async () =>
    {
        // Everything, including scope creation and service resolution, sits inside the try:
        // the task is discarded, so an exception escaping here would never be observed or logged.
        try
        {
            // Async scope so scoped services are disposed asynchronously when the work is done.
            await using var scope = scopes.CreateAsyncScope();
            var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();

            // talentOnly: the talent ("ready to work") listings are NoIndex/Disallow, so
            // rebuilding them doesn't churn any indexed URL. Shift/Job detail pages ARE indexed,
            // so they're left to self-clean via turnover.
            await svc.ReprocessAsync(talentOnly: true);
        }
        catch (Exception ex)
        {
            log.LogError(ex, "Background reprocess failed");
        }
    });

    IngestMessage = "پردازش مجدد آیتمهای ذخیرهشده در پسزمینه آغاز شد. نتیجه پس از اتمام در «تاریخچهٔ اجرا» نمایش داده میشود (بسته به تعداد آیتمها و سرعت هوش مصنوعی، چند دقیقه طول میکشد).";
    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Fills missing map coordinates on existing aggregated Tehran listings from their stored ad text
/// (TehranGeo). Done in place: no AI calls, no re-fetch and, crucially, no delete/recreate, so
/// indexed shift/job URLs keep their IDs. It is fast (pure DB + string matching), hence inline.
/// </summary>
/// <returns>A redirect back to this page, with the filled-in count in <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostBackfillCoordsAsync()
{
    var filledCount = await _ingest.BackfillCoordsAsync();

    IngestMessage = $"مختصات تقریبی برای {filledCount} آگهی جمعآوریشده از روی متن آگهی تکمیل شد (بدون تغییر شناسه یا آدرس صفحه).";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Fills missing salary on existing aggregated listings from their stored text, now that the
/// parser reads the Iranian "X toman" shorthand as millions. In place: no AI, no ID/URL change.
/// </summary>
/// <returns>A redirect back to this page, with the updated count in <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostBackfillPayAsync()
{
    var updatedCount = await _ingest.BackfillPayAsync();

    IngestMessage = $"حقوق برای {updatedCount} آگهیِ «توافقی» که در متن مبلغ داشت (مثل «۴۰ تا ۵۰ تومان») استخراج و ثبت شد. بدون تغییر شناسه/آدرس.";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// In-place cleanup of existing aggregated jobs/shifts: ARCHIVES (hides, but keeps the row) only
/// the out-of-scope ones (domestic-helper / promotional / spam) per the current validator, plus
/// near-duplicate job reposts. Archived pages drop from lists and the sitemap and return 410 Gone.
/// Valid listings keep their IDs/URLs. Reversible, no re-fetch, no AI, so it runs inline.
/// </summary>
/// <returns>A redirect back to this page, with both counts in <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostPurgeInvalidAsync()
{
    var (archivedCount, duplicateCount) = await _ingest.PurgeInvalidAggregatedAsync();

    IngestMessage = $"بایگانیِ درجا: {archivedCount} آگهیِ خارج از حوزه (خدمات منزل/تبلیغاتی/اسپم) و {duplicateCount} استخدامِ تکراری از سایت پنهان شد (وضعیت «بایگانی»؛ ردیف نگه داشته شد و قابل بازگشت است؛ صفحهشان ۴۱۰ Gone میدهد). آگهیهای معتبر و شناسه/آدرسشان دستنخورده ماند.";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Cleans up the crawl-generated facility table: merges Persian-fuzzy duplicate facilities and
/// folds junk-named ones (sentence fragments, source-attribution suffixes, a bare "clinic") into
/// the shared placeholder, repointing their listings first. Employer-owned / verified facilities
/// are never touched.
/// </summary>
/// <returns>A redirect back to this page, with both counts in <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostCleanFacilitiesAsync()
{
    var (mergedCount, cleanedCount) = await _ingest.MergeAndCleanFacilitiesAsync();

    IngestMessage = $"پاکسازی مراکز: {mergedCount} مرکزِ تکراری ادغام و {cleanedCount} مرکزِ بینام/نامعتبر حذف شد (آگهیهایشان به مرکزِ معتبر یا «نامشخص» منتقل شد). مراکز ثبتشده توسط کارفرما/تأییدشده دستنخورده ماند.";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Fixes existing aggregated listings that the AI mislabeled as "general practitioner" when the
/// ad is really for another role (dentist, specialist, ...). Corrected in place from the stored
/// text: no AI, no ID/URL change.
/// </summary>
/// <returns>A redirect back to this page, with the corrected count in <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostRecorrectRolesAsync()
{
    var correctedCount = await _ingest.RecorrectDoctorRolesAsync();

    IngestMessage = $"اصلاح نقش: {correctedCount} آگهیِ «پزشک عمومی» که در واقع نقش دیگری بود (دندانپزشک، متخصص و …) از روی متن آگهی اصلاح شد. بدون تغییر شناسه یا آدرس صفحه.";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Auto-merges duplicate, compound and misspelled roles minted by the dynamic taxonomy (for
/// example the same role created three times, two roles joined by "and", or a typo of an existing
/// role), repointing all listings first.
/// </summary>
/// <returns>A redirect back to this page, with the merged count in <see cref="IngestMessage"/>.</returns>
public async Task<IActionResult> OnPostMergeRolesAsync()
{
    var mergedCount = await _ingest.MergeDuplicateRolesAsync();

    IngestMessage = $"پاکسازی نقشها: {mergedCount} نقشِ تکراری/ترکیبی/غلطاملایی در نقشهای اصلی ادغام شد (آگهیهایشان منتقل شد). فهرست نقشها اکنون تمیزتر است.";

    return RedirectToPage();
}
|
|
|
|
/// <summary>
/// Populates every display property of the page: the requested page of the review queue (status
/// New) and of the flagged list (status Flagged) with their totals, the source names, the
/// published counters, and the 15 most recent ingestion runs.
/// </summary>
/// <param name="q">Requested 1-based queue page; clamped to [1, <see cref="QueuePages"/>].</param>
/// <param name="f">Requested 1-based flagged page; clamped to [1, <see cref="FlaggedPages"/>].</param>
private async Task LoadAsync(int q = 1, int f = 1)
{
    var pending = _db.RawListings.Where(r => r.Status == RawListingStatus.New);
    var flagged = _db.RawListings.Where(r => r.Status == RawListingStatus.Flagged);

    // The total must be set before clamping: QueuePages is derived from QueueTotal.
    QueueTotal = await pending.CountAsync();
    QueuePage = Math.Clamp(q, 1, QueuePages);
    Queue = await pending
        .OrderByDescending(r => r.Confidence)
        .ThenByDescending(r => r.FetchedAt)
        // Unique tiebreaker: without it, rows that tie on the keys above can be returned in a
        // different order by each query, so Skip/Take may repeat or drop items between pages.
        .ThenByDescending(r => r.Id)
        .Skip((QueuePage - 1) * PageSize)
        .Take(PageSize)
        .ToListAsync();

    // Same ordering rule for the flagged list: total first, then clamp, then page.
    FlaggedTotal = await flagged.CountAsync();
    FlaggedPage = Math.Clamp(f, 1, FlaggedPages);
    Flagged = await flagged
        .OrderByDescending(r => r.FetchedAt)
        .ThenByDescending(r => r.Id) // unique tiebreaker, see above
        .Skip((FlaggedPage - 1) * PageSize)
        .Take(PageSize)
        .ToListAsync();

    SourceNames = _ingest.SourceNames;

    // NOTE(review): shifts are counted only when Source != Direct, while job openings are counted
    // regardless of source; confirm whether that asymmetry is intended.
    PublishedShifts = await _db.Shifts.CountAsync(s => s.Source != ShiftSource.Direct);
    PublishedJobs = await _db.JobOpenings.CountAsync();

    Runs = await _db.IngestionRuns
        .OrderByDescending(r => r.RunAt)
        .Take(15)
        .ToListAsync();
}
|
|
}
|