using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Outcome of validating one parsed listing.
/// </summary>
/// <param name="IsValid">True when the text looks medical, is not spam, and the score reached the pass threshold.</param>
/// <param name="IsSpam">True when a spam marker was found and no medical marker was found.</param>
/// <param name="Confidence">Completeness score, clamped to the range 0–100.</param>
/// <param name="Issues">Human-readable (Persian) notes describing what was missing or suspicious.</param>
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues);

/// <summary>
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
/// real medical shift/job (role + a location or pay signal, plausible length, contact) to pass.
/// The confidence drives whether it lands in the review queue (New), gets Flagged for a closer
/// look, or is auto-discarded as spam.
/// </summary>
public class ListingValidator
{
    // Length bounds (in characters, after trimming) outside of which the text is penalised.
    private const int MinPlausibleLength = 25;
    private const int MaxPlausibleLength = 1500;

    // Number of http(s) links at which a post is penalised as link-heavy.
    private const int SuspiciousLinkCount = 3;

    // Minimum score required (together with medical / non-spam checks) for IsValid.
    private const int PassingScore = 50;

    // URI schemes are case-insensitive, so "HTTP://" counts as a link too.
    private static readonly Regex UrlSchemePattern = new(
        @"https?://",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // Posts that smell like ads/scams rather than medical shifts.
    private static readonly string[] SpamMarkers = NormalizeMarkers(
        "سرمایه گذاری", "سرمایه‌گذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین");

    // Vocabulary whose presence marks the text as belonging to the medical domain.
    private static readonly string[] MedicalMarkers = NormalizeMarkers(
        "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین",
        "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک");

    /// <summary>
    /// Validates a listing by combining keyword screening of the raw text with a completeness
    /// score computed from the parsed fields.
    /// </summary>
    /// <param name="rawText">Original post text; <c>null</c> is treated as an empty string.</param>
    /// <param name="parsed">Fields extracted from the post. Must not be <c>null</c>.</param>
    /// <returns>
    /// A <see cref="ValidationResult"/> carrying the validity flag, the spam flag, the clamped
    /// 0–100 score, and the list of issues found.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="parsed"/> is <c>null</c>.</exception>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        // Fail with a clear argument error instead of a NullReferenceException mid-scoring.
        ArgumentNullException.ThrowIfNull(parsed);

        var issues = new List<string>();
        var text = rawText ?? "";

        // Markers are compared against a normalized copy so that Arabic-keyboard letter
        // variants and ZWNJ-vs-space spelling differences do not defeat the keyword screen.
        var searchable = NormalizeForMatching(text);
        bool looksMedical = ContainsAny(searchable, MedicalMarkers);

        // Spam only when a spam marker is present and nothing medical is.
        bool isSpam = !looksMedical && ContainsAny(searchable, SpamMarkers);
        if (isSpam)
        {
            issues.Add("به‌نظر اسپم/تبلیغاتی است");
        }

        if (!looksMedical)
        {
            issues.Add("نشانه‌ای از حوزه درمان یافت نشد");
        }

        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("نقش مشخص نیست");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null)
        {
            score += 20;
        }
        else
        {
            issues.Add("شهر/محل مشخص نیست");
        }

        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
        {
            score += 20;
        }
        else
        {
            issues.Add("اطلاعات پرداخت یافت نشد");
        }

        if (parsed.Phone is not null)
        {
            score += 15;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        // Bonus only: a missing shift type is not reported as an issue.
        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 5;
        }

        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
        // Length and link checks run on the original text, not the normalized copy.
        int length = text.Trim().Length;
        if (length < MinPlausibleLength)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        if (length > MaxPlausibleLength)
        {
            score -= 10;
            issues.Add("متن غیرعادی بلند است");
        }

        if (UrlSchemePattern.Matches(text).Count >= SuspiciousLinkCount)
        {
            score -= 15;
            issues.Add("لینک‌های متعدد");
        }

        score = Math.Clamp(score, 0, 100);

        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && score >= PassingScore;

        return new ValidationResult(isValid, isSpam, score, issues);
    }

    // Returns true when the (already normalized) text contains at least one of the markers.
    private static bool ContainsAny(string normalizedText, string[] markers) =>
        markers.Any(marker => normalizedText.Contains(marker, StringComparison.Ordinal));

    // Normalizes every marker the same way the input text is normalized and drops duplicates.
    private static string[] NormalizeMarkers(params string[] markers) =>
        markers.Select(NormalizeForMatching).Distinct(StringComparer.Ordinal).ToArray();

    // Folds common Persian spelling variants into one form for keyword matching:
    // Arabic yeh/kaf become Persian yeh/kaf, and zero-width non-joiner becomes a space.
    // All replacements are one-to-one, so the string length is unchanged.
    private static string NormalizeForMatching(string value) =>
        value
            .Replace('\u064A', '\u06CC')
            .Replace('\u0643', '\u06A9')
            .Replace('\u200C', ' ');
}