Add scrape/ingestion engine + validation, and 24h shift hour-range visualization

Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue.

Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-03 08:18:19 +03:30
parent 69fa921fbd
commit 931b7b6ffb
24 changed files with 1439 additions and 26 deletions
@@ -0,0 +1,63 @@
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Outcome of running <see cref="ListingValidator.Validate"/> on a single listing.
/// </summary>
/// <param name="IsValid">Whether the listing passed validation.</param>
/// <param name="IsSpam">Whether the listing was classified as spam.</param>
/// <param name="Confidence">Completeness score assigned by the validator.</param>
/// <param name="Issues">Human-readable problems found while validating.</param>
public record ValidationResult(
    bool IsValid,
    bool IsSpam,
    int Confidence,
    List<string> Issues);
/// <summary>
/// Scores a parsed listing for completeness and screens out spam.
/// </summary>
/// <remarks>
/// <para>
/// A listing is valid when it is not spam, contains at least one medical marker and its
/// score is at least 50. No single field is mandatory: a listing without a role can still
/// pass if location, pay, phone and a medical marker are all present.
/// </para>
/// <para>
/// Score: role +30, city/district +20, pay signal +20, phone +15, shift type (shifts only) +10,
/// medical marker +5; text shorter than 25 chars -20, longer than 1500 chars -10,
/// three or more http(s) links -15. The result is clamped to 0..100.
/// </para>
/// <para>
/// The original design note says the confidence decides whether a listing goes to the review
/// queue, is flagged, or is discarded as spam. That routing is done by the caller, not here.
/// </para>
/// </remarks>
public class ListingValidator
{
    private const int MinTextLength = 25;
    private const int MaxTextLength = 1500;
    private const int SuspiciousLinkCount = 3;
    private const int MinValidScore = 50;

    // Posts that smell like ads/scams rather than medical shifts.
    // NOTE(review): matching is by substring, so a short marker such as "وام" also hits longer
    // words that contain it. This only matters for texts with no medical marker at all.
    private static readonly string[] SpamMarkers = NormalizeMarkers(new[]
    {
        "سرمایه گذاری", "سرمایه‌گذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین"
    });

    // Words that indicate the post is about a medical shift or job.
    private static readonly string[] MedicalMarkers = NormalizeMarkers(new[]
    {
        "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین",
        "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک"
    });

    // URI schemes are case-insensitive, so "HTTP://" counts as a link too.
    private static readonly Regex LinkPattern = new(
        @"https?://",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    /// <summary>
    /// Validates one listing and returns its spam verdict, score and list of issues.
    /// </summary>
    /// <param name="rawText">Original listing text; <c>null</c> is treated as empty.</param>
    /// <param name="parsed">Fields extracted from the text.</param>
    /// <returns>The validation verdict, the clamped score and the issues found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="parsed"/> is <c>null</c>.</exception>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        ArgumentNullException.ThrowIfNull(parsed);

        var issues = new List<string>();
        var text = rawText ?? "";

        // Markers are matched on a normalised copy; length and link checks use the raw text.
        var matchText = NormalizeForMatching(text);
        bool looksMedical = ContainsAny(matchText, MedicalMarkers);

        // A spam marker only counts when nothing medical is present.
        bool isSpam = !looksMedical && ContainsAny(matchText, SpamMarkers);

        if (isSpam)
        {
            issues.Add("به‌نظر اسپم/تبلیغاتی است");
        }

        if (!looksMedical)
        {
            issues.Add("نشانه‌ای از حوزه درمان یافت نشد");
        }

        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("نقش مشخص نیست");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null)
        {
            score += 20;
        }
        else
        {
            issues.Add("شهر/محل مشخص نیست");
        }

        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
        {
            score += 20;
        }
        else
        {
            issues.Add("اطلاعات پرداخت یافت نشد");
        }

        if (parsed.Phone is not null)
        {
            score += 15;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 5;
        }

        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
        var len = text.Trim().Length;
        if (len < MinTextLength)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        if (len > MaxTextLength)
        {
            score -= 10;
            issues.Add("متن غیرعادی بلند است");
        }

        if (LinkPattern.Matches(text).Count >= SuspiciousLinkCount)
        {
            score -= 15;
            issues.Add("لینک‌های متعدد");
        }

        score = Math.Clamp(score, 0, 100);

        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && score >= MinValidScore;
        return new ValidationResult(isValid, isSpam, score, issues);
    }

    // True when the text contains at least one marker (ordinal comparison).
    private static bool ContainsAny(string text, string[] markers) =>
        markers.Any(marker => text.Contains(marker, StringComparison.Ordinal));

    // Normalises a marker table the same way as the text, dropping duplicates that result.
    private static string[] NormalizeMarkers(string[] markers) =>
        markers.Select(NormalizeForMatching).Distinct(StringComparer.Ordinal).ToArray();

    // Folds common Persian spelling variants so the same word matches however it was typed:
    // Arabic yeh/kaf become Persian yeh/keheh, and ZWNJ becomes a plain space.
    private static string NormalizeForMatching(string value) =>
        value
            .Replace('\u064A', '\u06CC')
            .Replace('\u0643', '\u06A9')
            .Replace('\u200C', ' ');
}