Add scrape/ingestion engine + validation, and 24h shift hour-range visualization

Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue. Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 08:18:19 +03:30
parent 69fa921fbd
commit 931b7b6ffb
24 changed files with 1439 additions and 26 deletions
@@ -0,0 +1,63 @@
+using System.Text.RegularExpressions;
+using JobsMedical.Web.Models;
+
+namespace JobsMedical.Web.Services.Scraping;
+
+/// <summary>
+/// Immutable outcome of validating a single scraped listing.
+/// </summary>
+/// <param name="IsValid">Whether the listing passed validation.</param>
+/// <param name="IsSpam">Whether the listing was classified as spam/advertising.</param>
+/// <param name="Confidence">Completeness score assigned to the listing.</param>
+/// <param name="Issues">Human-readable notes describing every problem that was found.</param>
+public record ValidationResult(
+    bool IsValid,
+    bool IsSpam,
+    int Confidence,
+    List<string> Issues);
+
+/// <summary>
+/// Scores a parsed listing for completeness and screens out spam. A listing is considered
+/// valid when it is not spam, contains at least one medical marker, and reaches a
+/// completeness score of 50 or more.
+/// </summary>
+/// <remarks>
+/// Score weights: role 30, city/district 20, pay information 20, phone 15, shift type 10
+/// (only counted when the listing kind is Shift), medical marker 5. Penalties: trimmed text
+/// shorter than 25 characters −20, longer than 1500 characters −10, three or more URLs −15.
+/// The final score is clamped to 0–100. This class only computes the result; callers are
+/// expected to use it to route the listing (review queue, flagged, or discarded).
+/// </remarks>
+public class ListingValidator
+{
+    // Posts that smell like ads/scams rather than medical shifts.
+    // Stored in normalized form so half-space/space spelling variants collapse into one entry.
+    private static readonly string[] SpamMarkers = NormalizeMarkers(
+        "سرمایه گذاری", "سرمایه‌گذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
+        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین");
+
+    // Words that indicate the post is about a medical shift/job.
+    private static readonly string[] MedicalMarkers = NormalizeMarkers(
+        "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین",
+        "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک");
+
+    // Built once instead of re-resolving the pattern on every Validate call.
+    private static readonly Regex UrlPattern = new(@"https?://", RegexOptions.Compiled);
+
+    /// <summary>
+    /// Validates one listing and returns its validity, spam verdict, score and issue list.
+    /// </summary>
+    /// <param name="rawText">Original text of the post; <c>null</c> is treated as empty.</param>
+    /// <param name="parsed">Fields extracted from the post. Must not be <c>null</c>.</param>
+    /// <returns>The validation outcome; <c>Confidence</c> is the clamped 0–100 score.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="parsed"/> is <c>null</c>.</exception>
+    public ValidationResult Validate(string rawText, ParsedListing parsed)
+    {
+        ArgumentNullException.ThrowIfNull(parsed);
+
+        var issues = new List<string>();
+        var text = rawText ?? "";
+
+        // Marker matching runs on a normalized copy; length and URL checks use the raw text.
+        var normalized = Normalize(text);
+
+        bool looksMedical = ContainsAny(normalized, MedicalMarkers);
+
+        // A spam marker only counts when nothing medical is present in the same post.
+        bool isSpam = !looksMedical && ContainsAny(normalized, SpamMarkers);
+        if (isSpam)
+        {
+            issues.Add("به‌نظر اسپم/تبلیغاتی است");
+        }
+
+        if (!looksMedical)
+        {
+            issues.Add("نشانه‌ای از حوزه درمان یافت نشد");
+        }
+
+        int score = 0;
+
+        if (parsed.RoleName is not null)
+        {
+            score += 30;
+        }
+        else
+        {
+            issues.Add("نقش مشخص نیست");
+        }
+
+        if (parsed.CityName is not null || parsed.DistrictName is not null)
+        {
+            score += 20;
+        }
+        else
+        {
+            issues.Add("شهر/محل مشخص نیست");
+        }
+
+        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
+        {
+            score += 20;
+        }
+        else
+        {
+            issues.Add("اطلاعات پرداخت یافت نشد");
+        }
+
+        if (parsed.Phone is not null)
+        {
+            score += 15;
+        }
+        else
+        {
+            issues.Add("شماره تماس یافت نشد");
+        }
+
+        // Shift type is a bonus only for shift listings; its absence is not reported as an issue.
+        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
+        {
+            score += 10;
+        }
+
+        if (looksMedical)
+        {
+            score += 5;
+        }
+
+        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
+        var len = text.Trim().Length;
+        if (len < 25)
+        {
+            score -= 20;
+            issues.Add("متن خیلی کوتاه است");
+        }
+
+        if (len > 1500)
+        {
+            score -= 10;
+            issues.Add("متن غیرعادی بلند است");
+        }
+
+        if (UrlPattern.Matches(text).Count >= 3)
+        {
+            score -= 15;
+            issues.Add("لینک‌های متعدد");
+        }
+
+        score = Math.Clamp(score, 0, 100);
+
+        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
+        bool isValid = !isSpam && looksMedical && score >= 50;
+        return new ValidationResult(isValid, isSpam, score, issues);
+    }
+
+    // True when the (already normalized) text contains at least one of the markers.
+    private static bool ContainsAny(string text, string[] markers) =>
+        markers.Any(marker => text.Contains(marker, StringComparison.Ordinal));
+
+    // Normalizes every marker and drops entries that become identical after normalization.
+    private static string[] NormalizeMarkers(params string[] markers) =>
+        markers.Select(marker => Normalize(marker)).Distinct().ToArray();
+
+    // Folds common Persian spelling variants so one marker matches all of them:
+    // ZWNJ (half-space) becomes a plain space, Arabic yeh/kaf become Persian yeh/kaf.
+    // Every replacement is one char for one char, so the text length is unchanged.
+    private static string Normalize(string value) => value
+        .Replace('\u200C', ' ')
+        .Replace('\u064A', '\u06CC')
+        .Replace('\u0643', '\u06A9');
+}