Add scrape/ingestion engine + validation, and 24h shift hour-range visualization
Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue. Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Configuration values for the Divar listing source.
/// </summary>
public class DivarOptions
{
    /// <summary>Switches the source on; defaults to <c>false</c>.</summary>
    public bool Enabled { get; set; }

    /// <summary>City to search in, e.g. "tehran".</summary>
    public string? City { get; set; }

    /// <summary>
    /// Search terms to query, e.g. "استخدام پزشک" (roughly "hiring a physician").
    /// Starts out as an empty list.
    /// </summary>
    public List<string> Queries { get; set; } = new List<string>();
}
|
||||
|
||||
/// <summary>
/// Listing source for Divar. It is credential-ready rather than finished: set the city and the
/// search queries in configuration (Ingestion:Divar) and implement the fetch against Divar's
/// listing API/HTML. Until it is enabled it stays dormant.
/// </summary>
public class DivarListingSource : IListingSource
{
    private readonly DivarOptions _opts;
    private readonly ILogger<DivarListingSource> _log;

    /// <summary>Captures the bound options snapshot and the logger.</summary>
    /// <param name="opts">Options wrapper; its current <c>Value</c> is read once here.</param>
    /// <param name="log">Logger used to report why nothing was fetched.</param>
    public DivarListingSource(IOptions<DivarOptions> opts, ILogger<DivarListingSource> log)
    {
        _log = log;
        _opts = opts.Value;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "دیوار";

    /// <summary>
    /// True only when the options flag is on and at least one query is configured.
    /// </summary>
    public bool Enabled
    {
        get
        {
            if (!_opts.Enabled)
            {
                return false;
            }

            return _opts.Queries.Count > 0;
        }
    }

    /// <summary>
    /// Placeholder fetch: always completes synchronously with an empty list, logging whether
    /// that is because the source is not configured or because the fetch is not implemented.
    /// </summary>
    /// <param name="ct">Cancellation token; not observed by the current placeholder.</param>
    /// <returns>A completed task holding an empty list.</returns>
    public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (Enabled)
        {
            // TODO(prod): query Divar for each term in the configured city, map each ad's
            // title+description to new ScrapedItem(Name, text, adUrl).
            _log.LogWarning("Divar fetch not yet implemented; returning empty.");
        }
        else
        {
            _log.LogInformation("Divar source not configured — skipping.");
        }

        IReadOnlyList<ScrapedItem> nothing = Array.Empty<ScrapedItem>();
        return Task.FromResult(nothing);
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// A single raw post retrieved from a source, such as a Telegram message or a Divar ad.
/// </summary>
/// <param name="Source">Label identifying where the post came from.</param>
/// <param name="RawText">The post's text as fetched.</param>
/// <param name="SourceUrl">Link to the original post, or <c>null</c> when none is supplied.</param>
public record ScrapedItem(
    string Source,
    string RawText,
    string? SourceUrl = null);
|
||||
|
||||
/// <summary>
/// Contract for a pluggable source that the ingestion engine pulls posts from; write one
/// implementation per channel or site. The <see cref="Enabled"/> flag allows a source to be
/// registered yet stay dormant until it has been configured with credentials.
/// </summary>
public interface IListingSource
{
    /// <summary>Name of the source.</summary>
    string Name { get; }

    /// <summary>Whether the source is currently active.</summary>
    bool Enabled { get; }

    /// <summary>Retrieves the raw posts this source currently offers.</summary>
    /// <param name="ct">Token used to cancel the fetch.</param>
    /// <returns>The fetched posts.</returns>
    Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default);
}
|
||||
@@ -0,0 +1,107 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Data;
|
||||
using JobsMedical.Web.Models;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>Per-source counters for one ingestion run.</summary>
/// <param name="Source">Name of the source the counters belong to.</param>
/// <param name="Fetched">Number of posts fetched.</param>
/// <param name="Queued">Number of posts queued for review.</param>
/// <param name="Flagged">Number of posts flagged.</param>
/// <param name="Spam">Number of posts classified as spam.</param>
/// <param name="Duplicates">Number of posts skipped as duplicates.</param>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Flagged,
    int Spam,
    int Duplicates);
|
||||
|
||||
/// <summary>
/// Outcome of one ingestion run: the per-source results plus totals summed across them.
/// </summary>
/// <param name="Sources">One entry per source that was processed.</param>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <c>Queued</c> over all sources.</summary>
    public int TotalQueued
    {
        get { return Total(result => result.Queued); }
    }

    /// <summary>Sum of <c>Flagged</c> over all sources.</summary>
    public int TotalFlagged
    {
        get { return Total(result => result.Flagged); }
    }

    /// <summary>Sum of <c>Spam</c> over all sources.</summary>
    public int TotalSpam
    {
        get { return Total(result => result.Spam); }
    }

    /// <summary>Sum of <c>Duplicates</c> over all sources.</summary>
    public int TotalDuplicates
    {
        get { return Total(result => result.Duplicates); }
    }

    // Adds up one counter across every source result.
    private int Total(Func<SourceResult, int> counter)
    {
        return Sources.Select(counter).Sum();
    }
}
|
||||
|
||||
/// <summary>
/// The scrape engine. For every enabled <see cref="IListingSource"/> it fetches the raw posts,
/// drops duplicates by content hash, parses each remaining post with <see cref="IListingParser"/>,
/// validates it with <see cref="ListingValidator"/>, and stores it as a <see cref="RawListing"/>
/// with a status: New (queued for review), Flagged (incomplete/suspicious), or Discarded (spam).
/// Source-agnostic — add a source and it flows through unchanged.
/// </summary>
public class IngestionService
{
    private readonly AppDbContext _db;
    private readonly IEnumerable<IListingSource> _sources;
    private readonly IListingParser _parser;
    private readonly ListingValidator _validator;
    private readonly ILogger<IngestionService> _log;

    /// <summary>Stores the collaborators the engine works with.</summary>
    /// <param name="db">Database context used for lookups and for storing raw listings.</param>
    /// <param name="sources">All registered sources, enabled or not.</param>
    /// <param name="parser">Parser that extracts structured fields from raw text.</param>
    /// <param name="validator">Validator that scores a parsed listing and screens spam.</param>
    /// <param name="log">Logger for per-source results and fetch failures.</param>
    public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources,
        IListingParser parser, ListingValidator validator, ILogger<IngestionService> log)
    {
        _db = db;
        _sources = sources;
        _parser = parser;
        _validator = validator;
        _log = log;
    }

    /// <summary>Name and enabled flag of every registered source.</summary>
    public IReadOnlyList<(string Name, bool Enabled)> Sources =>
        _sources.Select(s => (s.Name, s.Enabled)).ToList();

    /// <summary>
    /// Runs one ingestion pass over all enabled sources and saves the new raw listings
    /// (one save per source).
    /// </summary>
    /// <param name="ct">Token used to cancel the run.</param>
    /// <returns>Per-source counters for the sources whose fetch succeeded.</returns>
    /// <exception cref="OperationCanceledException">
    /// The run was cancelled through <paramref name="ct"/>.
    /// </exception>
    public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
    {
        // Reference names handed to the parser; loaded once per run.
        var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
        var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
        var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);

        var results = new List<SourceResult>();

        // Hashes accepted during this run. Rows are only saved after a source's whole batch
        // has been processed, so the database check alone cannot see a repeat inside a batch.
        // NOTE(review): two overlapping runs could still insert the same hash; a unique index
        // on ContentHash would close that gap — confirm whether one exists.
        var seenThisRun = new HashSet<string>();

        foreach (var source in _sources.Where(s => s.Enabled))
        {
            int fetched = 0, queued = 0, flagged = 0, spam = 0, dupes = 0;
            IReadOnlyList<ScrapedItem> items;
            try
            {
                // A source that returns null is treated as having nothing to offer.
                items = await source.FetchAsync(ct) ?? Array.Empty<ScrapedItem>();
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // The caller asked us to stop: propagate instead of moving on to the next source.
                throw;
            }
            catch (Exception ex)
            {
                // One failing source must not prevent the remaining sources from running.
                _log.LogError(ex, "Source {Source} fetch failed", source.Name);
                continue;
            }

            foreach (var item in items)
            {
                fetched++;

                // Treat a missing text as empty so a malformed item cannot abort the batch.
                var text = item.RawText ?? "";
                var hash = Hash(text);

                if (!seenThisRun.Add(hash)
                    || await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct))
                {
                    dupes++;
                    continue;
                }

                var parsed = _parser.Parse(text, roles, cities, districts);
                var val = _validator.Validate(text, parsed);

                // Spam wins over everything; otherwise valid posts are queued, the rest flagged.
                var status = val.IsSpam ? RawListingStatus.Discarded
                    : val.IsValid ? RawListingStatus.New
                    : RawListingStatus.Flagged;

                if (status == RawListingStatus.New) queued++;
                else if (status == RawListingStatus.Flagged) flagged++;
                else spam++;

                _db.RawListings.Add(new RawListing
                {
                    SourceChannel = item.Source,
                    SourceUrl = item.SourceUrl,
                    RawText = text.Trim(),
                    ContentHash = hash,
                    Confidence = val.Confidence,
                    ValidationNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null,
                    Status = status,
                });
            }

            await _db.SaveChangesAsync(ct);
            results.Add(new SourceResult(source.Name, fetched, queued, flagged, spam, dupes));
            _log.LogInformation("Ingestion {Source}: fetched={F} queued={Q} flagged={Fl} spam={S} dupes={D}",
                source.Name, fetched, queued, flagged, spam, dupes);
        }

        return new IngestionSummary(results);
    }

    /// <summary>
    /// Lower-case SHA-256 hex of the text after trimming and collapsing every whitespace run to
    /// a single space (for cross-run dedupe). A null text hashes like the empty string.
    /// </summary>
    private static string Hash(string text)
    {
        var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalized));
        return Convert.ToHexString(bytes).ToLowerInvariant();
    }
}
|
||||
@@ -0,0 +1,59 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Settings for the scheduled ingestion worker.
/// </summary>
public class IngestionOptions
{
    /// <summary>Off by default — scheduled ingestion is opt-in via configuration.</summary>
    public bool Enabled { get; set; }

    /// <summary>Minutes between scheduled runs; defaults to 30.</summary>
    public int IntervalMinutes { get; set; } = 30;
}
|
||||
|
||||
/// <summary>
/// Periodically runs the ingestion engine when enabled (Ingestion:Enabled=true). Off by default
/// so nothing scrapes uninvited; admins can also trigger a run on demand from the admin UI.
/// </summary>
public class IngestionWorker : BackgroundService
{
    private readonly IServiceScopeFactory _scopes;
    private readonly IngestionOptions _opts;
    private readonly ILogger<IngestionWorker> _log;

    /// <summary>Captures the scope factory, the options snapshot, and the logger.</summary>
    /// <param name="scopes">Factory used to create one DI scope per run.</param>
    /// <param name="opts">Options wrapper; its current <c>Value</c> is read once here.</param>
    /// <param name="log">Logger for run summaries and failures.</param>
    public IngestionWorker(IServiceScopeFactory scopes, IOptions<IngestionOptions> opts,
        ILogger<IngestionWorker> log)
    {
        _scopes = scopes;
        _opts = opts.Value;
        _log = log;
    }

    /// <summary>
    /// Runs ingestion immediately and then once per interval until the host stops. Returns at
    /// once when the worker is disabled. A failed run is logged and the loop carries on.
    /// </summary>
    /// <param name="stoppingToken">Signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_opts.Enabled)
        {
            _log.LogInformation("Ingestion worker disabled (Ingestion:Enabled=false).");
            return;
        }

        // Clamp to at least one minute, and log the value that is actually used.
        var minutes = Math.Max(1, _opts.IntervalMinutes);
        var interval = TimeSpan.FromMinutes(minutes);
        _log.LogInformation("Ingestion worker on; every {Min} min.", minutes);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // A fresh scope per run, disposed when the run finishes.
                using var scope = _scopes.CreateScope();
                var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();
                var summary = await svc.RunAsync(stoppingToken);
                _log.LogInformation("Scheduled ingestion: queued={Q} flagged={F} spam={S} dupes={D}",
                    summary.TotalQueued, summary.TotalFlagged, summary.TotalSpam, summary.TotalDuplicates);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown was requested mid-run: leave quietly.
                break;
            }
            catch (Exception ex)
            {
                // Includes cancellations that did not come from shutdown (e.g. a timeout):
                // those are ordinary failures and must not end the worker.
                _log.LogError(ex, "Scheduled ingestion run failed");
            }

            try { await Task.Delay(interval, stoppingToken); }
            catch (OperationCanceledException) { break; }
        }
    }
}
|
||||
@@ -0,0 +1,63 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>Outcome of validating one parsed listing.</summary>
/// <param name="IsValid">Whether the listing is good enough for the review queue.</param>
/// <param name="IsSpam">Whether the listing was classified as spam.</param>
/// <param name="Confidence">Completeness score assigned to the listing.</param>
/// <param name="Issues">Notes describing what was missing or suspicious.</param>
public record ValidationResult(
    bool IsValid,
    bool IsSpam,
    int Confidence,
    List<string> Issues);
|
||||
|
||||
/// <summary>
/// Rates a parsed listing for completeness and screens out spam. Points are awarded for a role,
/// a location, pay information, a phone number and a shift type, then reduced for implausible
/// length or many links. The resulting confidence decides whether the listing goes to the review
/// queue (New), is Flagged for a closer look, or is discarded as spam.
/// </summary>
public class ListingValidator
{
    // Phrases typical of ads/scams rather than medical shifts.
    private static readonly string[] SpamMarkers =
    {
        "سرمایه گذاری", "سرمایهگذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین"
    };

    // Words indicating that a post belongs to the medical domain.
    private static readonly string[] MedicalMarkers =
    {
        "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین",
        "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک"
    };

    /// <summary>Scores one listing and classifies it as valid, incomplete, or spam.</summary>
    /// <param name="rawText">Original post text; <c>null</c> is treated as empty.</param>
    /// <param name="parsed">Fields extracted from the post by the parser.</param>
    /// <returns>
    /// The validity and spam flags, the score clamped to 0–100, and the collected issue notes.
    /// </returns>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        var text = rawText ?? "";
        var issues = new List<string>();

        // A post counts as spam only when it has a spam marker and no medical marker at all.
        bool looksMedical = MedicalMarkers.Any(marker => text.Contains(marker));
        bool isSpam = !looksMedical && SpamMarkers.Any(marker => text.Contains(marker));

        if (isSpam)
        {
            issues.Add("بهنظر اسپم/تبلیغاتی است");
        }

        if (!looksMedical)
        {
            issues.Add("نشانهای از حوزه درمان یافت نشد");
        }

        int score = 0;

        // Awards the points when the field is present, otherwise records what is missing.
        void Credit(bool present, int points, string missingNote)
        {
            if (present)
            {
                score += points;
            }
            else
            {
                issues.Add(missingNote);
            }
        }

        Credit(parsed.RoleName is not null, 30, "نقش مشخص نیست");
        Credit(parsed.CityName is not null || parsed.DistrictName is not null, 20, "شهر/محل مشخص نیست");
        Credit(
            parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable,
            20,
            "اطلاعات پرداخت یافت نشد");
        Credit(parsed.Phone is not null, 15, "شماره تماس یافت نشد");

        // Bonus points; their absence is not reported as an issue.
        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 5;
        }

        // Length sanity: a handful of words is not a real listing, a wall of text is suspicious.
        int length = text.Trim().Length;
        if (length < 25)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        if (length > 1500)
        {
            score -= 10;
            issues.Add("متن غیرعادی بلند است");
        }

        int linkCount = Regex.Matches(text, @"https?://").Count;
        if (linkCount >= 3)
        {
            score -= 15;
            issues.Add("لینکهای متعدد");
        }

        score = Math.Clamp(score, 0, 100);

        // Queue-worthy when medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && score >= 50;
        return new ValidationResult(isValid, isSpam, score, issues);
    }
}
|
||||
@@ -0,0 +1,27 @@
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Built-in source that serves a fixed set of representative Persian posts (the kind found in
/// shift channels). It needs no credentials and is always enabled, so the whole
/// ingestion → validation → review pipeline can run and be demoed as-is; the set deliberately
/// mixes good, incomplete, and spam posts so it also works as a fixture.
/// </summary>
public class SampleListingSource : IListingSource
{
    private static readonly string[] Posts =
    {
        "درمانگاه شبانهروزی در سعادتآباد نیازمند پزشک عمومی برای شیفت شب، کارانه ۳ میلیون تومان. تماس ۰۹۱۲۳۴۵۶۷۸۹",
        "کلینیک تخصصی در تهران به پرستار برای شیفت عصر نیازمند است، ۵۰٪ سهم درآمد. ۰۹۳۵۱۱۱۲۲۳۳",
        "استخدام ماما تماموقت در بیمارستان خصوصی، حقوق توافقی. منطقه شهرک غرب.",
        "نیازمند تکنسین اتاق عمل جهت همکاری در نارمک، شیفت صبح. ۰۹۱۲۰۰۰۰۰۰۰",
        "فروش فالوور و بک لینک ارزان، سرمایه گذاری در ارز دیجیتال با سود تضمینی!", // spam
        "پزشک", // too short / incomplete
        "بیمارستان آتیه جهت تکمیل کادر درمان به پزشک عمومی مقیم نیازمند است. قرارداد یکساله، حقوق ۴۵ میلیون ماهانه. تهرانپارس.",
    };

    /// <summary>Display name of this source.</summary>
    public string Name => "نمونه (کانال آزمایشی)";

    /// <summary>Always <c>true</c>.</summary>
    public bool Enabled => true;

    /// <summary>
    /// Wraps every sample post in a <see cref="ScrapedItem"/> labelled with <see cref="Name"/>
    /// and no URL.
    /// </summary>
    /// <param name="ct">Cancellation token; not observed because nothing is awaited.</param>
    /// <returns>A completed task holding a new list with one item per sample post.</returns>
    public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        var batch = new List<ScrapedItem>(Posts.Length);
        foreach (var post in Posts)
        {
            batch.Add(new ScrapedItem(Name, post));
        }

        return Task.FromResult<IReadOnlyList<ScrapedItem>>(batch);
    }
}
|
||||
@@ -0,0 +1,44 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Configuration values for the Telegram/Bale listing source.
/// </summary>
public class TelegramOptions
{
    /// <summary>Switches the source on; defaults to <c>false</c>.</summary>
    public bool Enabled { get; set; }

    /// <summary>Bot token used to access the API; <c>null</c> until configured.</summary>
    public string? BotToken { get; set; }

    /// <summary>@channel handles to read. Starts out as an empty list.</summary>
    public List<string> Channels { get; set; } = new List<string>();
}
|
||||
|
||||
/// <summary>
/// Listing source for Telegram/Bale channels. It is credential-ready rather than finished:
/// supply a bot token and a channel list in configuration (Ingestion:Telegram) and implement the
/// fetch against the Bot API (getUpdates / channel posts) or a userbot. It stays dormant until
/// enabled, so the engine runs without it.
/// </summary>
public class TelegramListingSource : IListingSource
{
    private readonly TelegramOptions _opts;
    private readonly ILogger<TelegramListingSource> _log;

    /// <summary>Captures the bound options snapshot and the logger.</summary>
    /// <param name="opts">Options wrapper; its current <c>Value</c> is read once here.</param>
    /// <param name="log">Logger used to report why nothing was fetched.</param>
    public TelegramListingSource(IOptions<TelegramOptions> opts, ILogger<TelegramListingSource> log)
    {
        _log = log;
        _opts = opts.Value;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "تلگرام/بله";

    /// <summary>
    /// True only when the options flag is on, a non-blank bot token is present, and at least one
    /// channel is configured.
    /// </summary>
    public bool Enabled
    {
        get
        {
            if (!_opts.Enabled)
            {
                return false;
            }

            if (string.IsNullOrWhiteSpace(_opts.BotToken))
            {
                return false;
            }

            return _opts.Channels.Count > 0;
        }
    }

    /// <summary>
    /// Placeholder fetch: always completes synchronously with an empty list, logging whether
    /// that is because the source is not configured or because the fetch is not implemented.
    /// </summary>
    /// <param name="ct">Cancellation token; not observed by the current placeholder.</param>
    /// <returns>A completed task holding an empty list.</returns>
    public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (Enabled)
        {
            // TODO(prod): call https://api.telegram.org/bot{token}/getUpdates (or channel history),
            // map each message to new ScrapedItem(Name, message.Text, messageLink). The validation +
            // dedupe pipeline downstream is already source-agnostic.
            _log.LogWarning("Telegram fetch not yet implemented; returning empty.");
        }
        else
        {
            _log.LogInformation("Telegram source not configured — skipping.");
        }

        IReadOnlyList<ScrapedItem> nothing = Array.Empty<ScrapedItem>();
        return Task.FromResult(nothing);
    }
}
|
||||
Reference in New Issue
Block a user