931b7b6ffb
Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue. Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
64 lines
3.2 KiB
C#
64 lines
3.2 KiB
C#
using System.Text.RegularExpressions;
|
|
using JobsMedical.Web.Models;
|
|
|
|
namespace JobsMedical.Web.Services.Scraping;
|
|
|
|
/// <summary>
/// Outcome of validating one parsed listing.
/// </summary>
/// <param name="IsValid">True when the listing is medical, not spam, and scored high enough.</param>
/// <param name="IsSpam">True when the text matched a spam marker and no medical marker.</param>
/// <param name="Confidence">Completeness score, clamped to the range 0–100.</param>
/// <param name="Issues">Human-readable notes describing what was missing or suspicious.</param>
public record ValidationResult(
    bool IsValid,
    bool IsSpam,
    int Confidence,
    List<string> Issues);
|
|
|
|
/// <summary>
/// Scores a parsed listing for completeness and screens out spam.
/// <para>
/// The score is built from the parsed fields (role, location, pay, phone, shift type) plus a
/// small bonus when the raw text contains a medical keyword, then reduced for text that is too
/// short, too long, or link-heavy. A listing is considered valid when it is not spam, contains
/// at least one medical keyword, and reaches the minimum score.
/// </para>
/// <para>
/// NOTE(review): how callers map the result onto queue states (New / Flagged / discarded) is
/// decided outside this class — confirm against the ingestion code.
/// </para>
/// </summary>
public class ListingValidator
{
    // Text shorter than this (after trimming) is treated as too short to be a real listing.
    private const int MinTextLength = 25;

    // Text longer than this (after trimming) is treated as suspiciously long.
    private const int MaxTextLength = 1500;

    // This many (or more) http/https links in one post is penalised.
    private const int SuspiciousLinkCount = 3;

    // Minimum clamped score a listing needs to be considered valid.
    private const int MinValidScore = 50;

    // Cached so the pattern is parsed/compiled once instead of being looked up on every call.
    private static readonly Regex UrlPrefix = new(@"https?://", RegexOptions.Compiled);

    // Posts that smell like ads/scams rather than medical shifts.
    private static readonly string[] SpamMarkers =
    {
        "سرمایه گذاری", "سرمایهگذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین"
    };

    // Keywords whose presence marks the text as belonging to the medical domain.
    private static readonly string[] MedicalMarkers =
    {
        "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین",
        "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک"
    };

    /// <summary>
    /// Validates a listing and produces a completeness score together with any issues found.
    /// </summary>
    /// <param name="rawText">The original listing text; <c>null</c> is treated as empty.</param>
    /// <param name="parsed">The fields extracted from <paramref name="rawText"/>.</param>
    /// <returns>
    /// A <see cref="ValidationResult"/> carrying the validity flag, the spam flag, the score
    /// clamped to 0–100, and the list of issue messages in the order they were detected.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="parsed"/> is <c>null</c>.</exception>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        // Fail fast with a descriptive exception instead of a NullReferenceException mid-scoring.
        ArgumentNullException.ThrowIfNull(parsed);

        var issues = new List<string>();
        var text = rawText ?? "";

        // Scan the medical keywords once; both the spam screen and the scoring reuse the result.
        bool looksMedical = MedicalMarkers.Any(marker => text.Contains(marker, StringComparison.Ordinal));

        // A spam keyword only counts when no medical keyword is present at all.
        bool isSpam = !looksMedical
                      && SpamMarkers.Any(marker => text.Contains(marker, StringComparison.Ordinal));

        if (isSpam)
        {
            issues.Add("بهنظر اسپم/تبلیغاتی است");
        }

        if (!looksMedical)
        {
            issues.Add("نشانهای از حوزه درمان یافت نشد");
        }

        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("نقش مشخص نیست");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null)
        {
            score += 20;
        }
        else
        {
            issues.Add("شهر/محل مشخص نیست");
        }

        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
        {
            score += 20;
        }
        else
        {
            issues.Add("اطلاعات پرداخت یافت نشد");
        }

        if (parsed.Phone is not null)
        {
            score += 15;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        // The shift-type bonus applies only to listings parsed as shifts; no issue is recorded otherwise.
        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 5;
        }

        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
        int length = text.Trim().Length;

        if (length < MinTextLength)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        if (length > MaxTextLength)
        {
            score -= 10;
            issues.Add("متن غیرعادی بلند است");
        }

        if (UrlPrefix.Matches(text).Count >= SuspiciousLinkCount)
        {
            score -= 15;
            issues.Add("لینکهای متعدد");
        }

        // Penalties can push the raw sum below zero; keep the reported score within 0–100.
        score = Math.Clamp(score, 0, 100);

        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && score >= MinValidScore;

        return new ValidationResult(isValid, isSpam, score, issues);
    }
}
|