Add scrape/ingestion engine + validation, and 24h shift hour-range visualization

Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue.

Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-03 08:18:19 +03:30
parent 69fa921fbd
commit 931b7b6ffb
24 changed files with 1439 additions and 26 deletions
@@ -0,0 +1,42 @@
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Settings for the Divar source. The source only reports itself as enabled when
/// <see cref="Enabled"/> is true and at least one query is configured.
/// </summary>
public class DivarOptions
{
    /// <summary>Master switch for the source; false unless set by configuration.</summary>
    public bool Enabled { get; set; }

    /// <summary>City to search in, e.g. "tehran".</summary>
    public string? City { get; set; }

    /// <summary>Search terms to run; starts out as an empty list.</summary>
    public List<string> Queries { get; set; } = new List<string>();
}
/// <summary>
/// Listing source for Divar. Currently a stub: it is wired to <see cref="DivarOptions"/> and
/// reports whether it is configured, but the actual fetch is not implemented and always
/// yields an empty list.
/// </summary>
public class DivarListingSource : IListingSource
{
    private readonly DivarOptions _settings;
    private readonly ILogger<DivarListingSource> _logger;

    public DivarListingSource(IOptions<DivarOptions> opts, ILogger<DivarListingSource> log)
    {
        _settings = opts.Value;
        _logger = log;
    }

    /// <summary>Display name of the source.</summary>
    public string Name => "دیوار";

    /// <summary>True when the source is switched on and has at least one query configured.</summary>
    public bool Enabled
    {
        get { return _settings.Enabled ? _settings.Queries.Count > 0 : false; }
    }

    /// <summary>
    /// Returns an empty list in every case; logs why (not configured vs. not implemented).
    /// </summary>
    public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (Enabled)
        {
            // TODO(prod): query Divar for each term in the configured city, map each ad's
            // title+description to new ScrapedItem(Name, text, adUrl).
            _logger.LogWarning("Divar fetch not yet implemented; returning empty.");
        }
        else
        {
            _logger.LogInformation("Divar source not configured — skipping.");
        }

        IReadOnlyList<ScrapedItem> nothing = Array.Empty<ScrapedItem>();
        return Task.FromResult(nothing);
    }
}
@@ -0,0 +1,15 @@
namespace JobsMedical.Web.Services.Scraping;
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).</summary>
/// <param name="Source">Name of the source the post came from; the ingestion service stores it as the listing's source channel.</param>
/// <param name="RawText">Unprocessed text of the post; this is what gets hashed, parsed and validated.</param>
/// <param name="SourceUrl">Optional link back to the original post; null when not supplied.</param>
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null);
/// <summary>
/// A pluggable source the ingestion engine pulls from. Implement once per channel/site.
/// `Enabled` lets a source be present but dormant until it's configured with credentials.
/// </summary>
public interface IListingSource
{
    /// <summary>Display name of the source; also used to label results and log lines.</summary>
    string Name { get; }
    /// <summary>Whether the source should be pulled from; the ingestion run skips sources where this is false.</summary>
    bool Enabled { get; }
    /// <summary>Fetches the current batch of raw posts from the source.</summary>
    /// <param name="ct">Token used to cancel the fetch.</param>
    /// <returns>The fetched posts; the implementations in this change return an empty list when there is nothing to fetch.</returns>
    Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default);
}
@@ -0,0 +1,107 @@
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using Microsoft.EntityFrameworkCore;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Per-source counters produced by one ingestion run.</summary>
/// <param name="Source">Name of the source these counters belong to.</param>
/// <param name="Fetched">Number of items the source returned.</param>
/// <param name="Queued">Items stored with status New (queued for review).</param>
/// <param name="Flagged">Items stored with status Flagged.</param>
/// <param name="Spam">Items stored with status Discarded (spam).</param>
/// <param name="Duplicates">Items skipped because their content hash was already known.</param>
public record SourceResult(string Source, int Fetched, int Queued, int Flagged, int Spam, int Duplicates);
/// <summary>Outcome of one ingestion run: the per-source results plus totals across them.</summary>
/// <param name="Sources">One entry per source that was processed.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <see cref="SourceResult.Queued"/> over all sources.</summary>
    public int TotalQueued => Total(static r => r.Queued);

    /// <summary>Sum of <see cref="SourceResult.Flagged"/> over all sources.</summary>
    public int TotalFlagged => Total(static r => r.Flagged);

    /// <summary>Sum of <see cref="SourceResult.Spam"/> over all sources.</summary>
    public int TotalSpam => Total(static r => r.Spam);

    /// <summary>Sum of <see cref="SourceResult.Duplicates"/> over all sources.</summary>
    public int TotalDuplicates => Total(static r => r.Duplicates);

    // Adds up one counter across every source result.
    private int Total(Func<SourceResult, int> counter) => Sources.Sum(counter);
}
/// <summary>
/// The scrape engine. Pulls from every enabled <see cref="IListingSource"/>, dedupes by content
/// hash (against the database and against items already accepted in the same run), parses with
/// <see cref="IListingParser"/>, validates with <see cref="ListingValidator"/>, and stores each
/// as a <see cref="RawListing"/> with a status: New (queued for review), Flagged
/// (incomplete/suspicious), or Discarded (spam). Source-agnostic — add a source and it flows
/// through unchanged.
/// </summary>
public class IngestionService
{
    private readonly AppDbContext _db;
    private readonly IEnumerable<IListingSource> _sources;
    private readonly IListingParser _parser;
    private readonly ListingValidator _validator;
    private readonly ILogger<IngestionService> _log;

    public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources,
        IListingParser parser, ListingValidator validator, ILogger<IngestionService> log)
    {
        _db = db;
        _sources = sources;
        _parser = parser;
        _validator = validator;
        _log = log;
    }

    /// <summary>Name and enabled flag of every registered source (enabled or not).</summary>
    public IReadOnlyList<(string Name, bool Enabled)> Sources =>
        _sources.Select(s => (s.Name, s.Enabled)).ToList();

    /// <summary>
    /// Runs one ingestion pass over all enabled sources and saves the results per source.
    /// </summary>
    /// <param name="ct">Cancels the run; a cancellation requested through this token propagates to the caller.</param>
    /// <returns>Per-source counters for the sources that were fetched successfully.</returns>
    public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
    {
        var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
        var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
        var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);
        var results = new List<SourceResult>();

        // Hashes already accepted during this run. The database query below cannot see rows
        // that were added to the context but not saved yet, so without this set two identical
        // posts in the same batch would both be stored.
        var seenThisRun = new HashSet<string>(StringComparer.Ordinal);

        foreach (var source in _sources.Where(s => s.Enabled))
        {
            int fetched = 0, queued = 0, flagged = 0, spam = 0, dupes = 0;
            IReadOnlyList<ScrapedItem> items;
            try
            {
                items = await source.FetchAsync(ct);
            }
            // A failing source must not stop the others, but a cancellation requested by the
            // caller is not a source failure and has to propagate.
            catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
            {
                _log.LogError(ex, "Source {Source} fetch failed", source.Name);
                continue;
            }

            foreach (var item in items)
            {
                fetched++;
                // Hash() already tolerates null text; use the same fallback for the rest of
                // the pipeline so a source handing over a null body cannot crash the run.
                var rawText = item.RawText ?? string.Empty;
                var hash = Hash(rawText);
                if (!seenThisRun.Add(hash)
                    || await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct))
                {
                    dupes++;
                    continue;
                }

                var parsed = _parser.Parse(rawText, roles, cities, districts);
                var val = _validator.Validate(rawText, parsed);
                var status = val.IsSpam ? RawListingStatus.Discarded
                    : val.IsValid ? RawListingStatus.New
                    : RawListingStatus.Flagged;
                if (status == RawListingStatus.New) queued++;
                else if (status == RawListingStatus.Flagged) flagged++;
                else spam++;

                _db.RawListings.Add(new RawListing
                {
                    SourceChannel = item.Source,
                    SourceUrl = item.SourceUrl,
                    RawText = rawText.Trim(),
                    ContentHash = hash,
                    Confidence = val.Confidence,
                    ValidationNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null,
                    Status = status,
                });
            }

            await _db.SaveChangesAsync(ct);
            results.Add(new SourceResult(source.Name, fetched, queued, flagged, spam, dupes));
            _log.LogInformation("Ingestion {Source}: fetched={F} queued={Q} flagged={Fl} spam={S} dupes={D}",
                source.Name, fetched, queued, flagged, spam, dupes);
        }

        return new IngestionSummary(results);
    }

    /// <summary>SHA-256 hex of the whitespace-normalized text (for cross-run dedupe).</summary>
    private static string Hash(string text)
    {
        // Trim and collapse whitespace runs so formatting-only differences hash the same.
        var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalized));
        return Convert.ToHexString(bytes).ToLowerInvariant();
    }
}
@@ -0,0 +1,59 @@
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Settings for the scheduled ingestion worker.</summary>
public class IngestionOptions
{
    /// <summary>Whether the scheduled worker runs at all. Off by default — opt in via config.</summary>
    public bool Enabled { get; set; }

    /// <summary>Minutes between scheduled runs; defaults to 30.</summary>
    public int IntervalMinutes { get; set; } = 30;
}
/// <summary>
/// Periodically runs the ingestion engine when enabled (Ingestion:Enabled=true). Off by default
/// so nothing scrapes uninvited; admins can also trigger a run on demand from the admin UI.
/// </summary>
public class IngestionWorker : BackgroundService
{
    private readonly IServiceScopeFactory _scopes;
    private readonly IngestionOptions _opts;
    private readonly ILogger<IngestionWorker> _log;

    public IngestionWorker(IServiceScopeFactory scopes, IOptions<IngestionOptions> opts,
        ILogger<IngestionWorker> log)
    {
        _scopes = scopes;
        _opts = opts.Value;
        _log = log;
    }

    /// <summary>
    /// Runs the ingestion service in a loop, one run per interval, until the host stops.
    /// Returns immediately when the worker is disabled. A failed run is logged and the loop
    /// carries on with the next interval.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the service should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_opts.Enabled)
        {
            _log.LogInformation("Ingestion worker disabled (Ingestion:Enabled=false).");
            return;
        }

        // Clamp to at least one minute, and log the value that is actually used.
        var minutes = Math.Max(1, _opts.IntervalMinutes);
        var interval = TimeSpan.FromMinutes(minutes);
        _log.LogInformation("Ingestion worker on; every {Min} min.", minutes);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // IngestionService is resolved from a fresh scope for every run.
                using var scope = _scopes.CreateScope();
                var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();
                var summary = await svc.RunAsync(stoppingToken);
                _log.LogInformation("Scheduled ingestion: queued={Q} flagged={F} spam={S} dupes={D}",
                    summary.TotalQueued, summary.TotalFlagged, summary.TotalSpam, summary.TotalDuplicates);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Host shutdown: leave quietly.
                break;
            }
            catch (Exception ex)
            {
                // Includes cancellations that are NOT a shutdown (e.g. a timeout surfacing as
                // TaskCanceledException). Letting those escape would end the worker for good.
                _log.LogError(ex, "Scheduled ingestion run failed");
            }

            try { await Task.Delay(interval, stoppingToken); }
            catch (OperationCanceledException) { break; }
        }
    }
}
@@ -0,0 +1,63 @@
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Outcome of validating one parsed listing.</summary>
/// <param name="IsValid">True when the listing is medical, not spam, and scored at least 50.</param>
/// <param name="IsSpam">True when the text hit a spam marker and no medical marker.</param>
/// <param name="Confidence">Completeness score, clamped to 0–100.</param>
/// <param name="Issues">Human-readable notes (in Persian) about what was missing or suspicious.</param>
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues);
/// <summary>
/// Scores a parsed listing for completeness and screens out spam. The score adds points for a
/// recognised role, a location, a pay signal, a phone number, a shift type and medical wording,
/// and subtracts points for implausible length or many links. A listing is valid when it looks
/// medical, is not spam, and scores at least 50. The caller uses the result to queue the
/// listing for review, flag it for a closer look, or discard it as spam.
/// </summary>
public class ListingValidator
{
    // Posts that smell like ads/scams rather than medical shifts.
    private static readonly string[] SpamMarkers =
    {
        "سرمایه گذاری", "سرمایه‌گذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین"
    };

    // Words that indicate a post is about medical work.
    private static readonly string[] MedicalMarkers =
    {
        "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین",
        "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک"
    };

    /// <summary>Validates one listing.</summary>
    /// <param name="rawText">Original text of the post; null is treated as empty.</param>
    /// <param name="parsed">Fields the parser extracted from <paramref name="rawText"/>.</param>
    /// <returns>Validity, spam flag, 0–100 score, and the list of issues found.</returns>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        var issues = new List<string>();
        var text = rawText ?? "";

        // Marker matching runs on a normalized copy so that Arabic-keyboard letter variants and
        // ZWNJ-vs-space spellings still hit the markers. Length and link checks use the
        // original text.
        var haystack = NormalizeForMatching(text);
        bool looksMedical = MedicalMarkers.Any(haystack.Contains);
        bool isSpam = !looksMedical && SpamMarkers.Any(haystack.Contains);

        if (isSpam) issues.Add("به‌نظر اسپم/تبلیغاتی است");
        if (!looksMedical) issues.Add("نشانه‌ای از حوزه درمان یافت نشد");

        int score = 0;
        if (parsed.RoleName is not null) score += 30; else issues.Add("نقش مشخص نیست");
        if (parsed.CityName is not null || parsed.DistrictName is not null) score += 20;
        else issues.Add("شهر/محل مشخص نیست");
        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
            score += 20;
        else issues.Add("اطلاعات پرداخت یافت نشد");
        if (parsed.Phone is not null) score += 15; else issues.Add("شماره تماس یافت نشد");
        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null) score += 10;
        if (looksMedical) score += 5;

        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
        var len = text.Trim().Length;
        if (len < 25) { score -= 20; issues.Add("متن خیلی کوتاه است"); }
        if (len > 1500) { score -= 10; issues.Add("متن غیرعادی بلند است"); }
        if (Regex.Matches(text, @"https?://").Count >= 3) { score -= 15; issues.Add("لینک‌های متعدد"); }
        score = Math.Clamp(score, 0, 100);

        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && score >= 50;
        return new ValidationResult(isValid, isSpam, score, issues);
    }

    // Folds common spelling variants of Persian text into the form the marker lists use:
    // Arabic Yeh / Alef Maksura -> Persian Yeh, Arabic Kaf -> Persian Kaf, and
    // zero-width non-joiner -> plain space.
    private static string NormalizeForMatching(string text) =>
        text.Replace('\u064A', '\u06CC')
            .Replace('\u0649', '\u06CC')
            .Replace('\u0643', '\u06A9')
            .Replace('\u200C', ' ');
}
@@ -0,0 +1,27 @@
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Built-in source that serves a fixed set of representative Persian posts. It is always
/// enabled and needs no credentials, so the ingestion → validation → review pipeline can be
/// exercised end to end; the set mixes complete, incomplete, and spam posts.
/// </summary>
public class SampleListingSource : IListingSource
{
    // The fixed sample posts, returned in this order on every fetch.
    private static readonly string[] Posts =
    {
        "درمانگاه شبانه‌روزی در سعادت‌آباد نیازمند پزشک عمومی برای شیفت شب، کارانه ۳ میلیون تومان. تماس ۰۹۱۲۳۴۵۶۷۸۹",
        "کلینیک تخصصی در تهران به پرستار برای شیفت عصر نیازمند است، ۵۰٪ سهم درآمد. ۰۹۳۵۱۱۱۲۲۳۳",
        "استخدام ماما تمام‌وقت در بیمارستان خصوصی، حقوق توافقی. منطقه شهرک غرب.",
        "نیازمند تکنسین اتاق عمل جهت همکاری در نارمک، شیفت صبح. ۰۹۱۲۰۰۰۰۰۰۰",
        "فروش فالوور و بک لینک ارزان، سرمایه گذاری در ارز دیجیتال با سود تضمینی!", // spam
        "پزشک", // too short / incomplete
        "بیمارستان آتیه جهت تکمیل کادر درمان به پزشک عمومی مقیم نیازمند است. قرارداد یک‌ساله، حقوق ۴۵ میلیون ماهانه. تهرانپارس.",
    };

    /// <summary>Display name of the sample source.</summary>
    public string Name => "نمونه (کانال آزمایشی)";

    /// <summary>The sample source is always on.</summary>
    public bool Enabled => true;

    /// <summary>Wraps every sample post in a <see cref="ScrapedItem"/> tagged with this source's name.</summary>
    public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        var batch = new List<ScrapedItem>();
        foreach (var post in Posts)
        {
            batch.Add(new ScrapedItem(Name, post));
        }

        IReadOnlyList<ScrapedItem> result = batch;
        return Task.FromResult(result);
    }
}
@@ -0,0 +1,44 @@
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Settings for the Telegram/Bale source. The source only reports itself as enabled when
/// <see cref="Enabled"/> is true, a bot token is present, and at least one channel is listed.
/// </summary>
public class TelegramOptions
{
    /// <summary>Master switch for the source; false unless set by configuration.</summary>
    public bool Enabled { get; set; }

    /// <summary>Bot token used to authenticate; null until configured.</summary>
    public string? BotToken { get; set; }

    /// <summary>@channel handles to read; starts out as an empty list.</summary>
    public List<string> Channels { get; set; } = new List<string>();
}
/// <summary>
/// Listing source for Telegram/Bale channels. Currently a stub: it is wired to
/// <see cref="TelegramOptions"/> and reports whether it is configured, but the actual fetch is
/// not implemented and always yields an empty list, so the engine runs without it.
/// </summary>
public class TelegramListingSource : IListingSource
{
    private readonly TelegramOptions _settings;
    private readonly ILogger<TelegramListingSource> _logger;

    public TelegramListingSource(IOptions<TelegramOptions> opts, ILogger<TelegramListingSource> log)
    {
        _settings = opts.Value;
        _logger = log;
    }

    /// <summary>Display name of the source.</summary>
    public string Name => "تلگرام/بله";

    /// <summary>
    /// True when the source is switched on, a non-blank bot token is set, and at least one
    /// channel is configured.
    /// </summary>
    public bool Enabled
    {
        get
        {
            if (!_settings.Enabled) return false;
            if (string.IsNullOrWhiteSpace(_settings.BotToken)) return false;
            return _settings.Channels.Count > 0;
        }
    }

    /// <summary>
    /// Returns an empty list in every case; logs why (not configured vs. not implemented).
    /// </summary>
    public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (Enabled)
        {
            // TODO(prod): call https://api.telegram.org/bot{token}/getUpdates (or channel history),
            // map each message to new ScrapedItem(Name, message.Text, messageLink). The validation +
            // dedupe pipeline downstream is already source-agnostic.
            _logger.LogWarning("Telegram fetch not yet implemented; returning empty.");
        }
        else
        {
            _logger.LogInformation("Telegram source not configured — skipping.");
        }

        IReadOnlyList<ScrapedItem> nothing = Array.Empty<ScrapedItem>();
        return Task.FromResult(nothing);
    }
}