Files
hamkadr/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs
T

86 lines
4.8 KiB
C#
Raw Normal View History

using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Outcome of <see cref="ListingValidator.Validate"/> for a single scraped listing.
/// </summary>
/// <param name="IsValid">True when the listing is not spam, contains a medical marker, and scored at least 50.</param>
/// <param name="IsSpam">True when the text matched a spam marker and no medical marker.</param>
/// <param name="Confidence">Completeness score, clamped to the range 0–100.</param>
/// <param name="Issues">User-facing (Persian) descriptions of each problem found, in the order they were detected.</param>
public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List<string> Issues);
/// <summary>
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
/// real medical shift/job (role + a location or pay signal, plausible length, contact) to pass.
/// The confidence drives whether it lands in the review queue (New), gets Flagged for a closer
/// look, or is auto-discarded as spam.
/// </summary>
/// <remarks>
/// Marker matching is done on a normalized copy of the text: Arabic-codepoint yeh/kaf are folded
/// to their Persian forms and zero-width non-joiners are treated as spaces, so that posts typed
/// on an Arabic keyboard layout or with half-spaces still match the marker lists below.
/// </remarks>
public class ListingValidator
{
    /// <summary>Minimum confidence a non-spam, medical-looking listing needs to be valid.</summary>
    private const int MinValidConfidence = 50;

    // Counts http/https links; several links in one post is treated as a spam signal.
    private static readonly Regex LinkPattern =
        new(@"https?://", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // Posts that smell like ads/scams rather than medical shifts.
    private static readonly string[] SpamMarkers = NormalizeAll(
        "سرمایه گذاری", "سرمایه‌گذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین");

    // Clinical/health markers ONLY. Deliberately excludes generic words like «استخدام» and «شیفت»
    // (they match retail/restaurant ads on Divar). A post must contain a real care-domain term.
    private static readonly string[] MedicalMarkers = NormalizeAll(
        "درمانگاه", "بیمارستان", "کلینیک", "مطب", "اورژانس", "کادر درمان", "پلی کلینیک",
        "پزشک", "دکتر", "پرستار", "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر",
        "ماما", "مامایی", "تکنسین", "اتاق عمل", "بیهوشی", "رادیولوژی", "سونوگرافی",
        "آزمایشگاه", "تزریقات", "پانسمان", "فیزیوتراپ", "دندان", "داروخانه", "داروساز",
        "دیالیز", "فوریت", "آی سی یو", "سی سی یو", "آنکال", "کشیک", "تریاژ", "نوزادان", "سالمند");

    /// <summary>
    /// Validates one scraped post against its parsed fields.
    /// </summary>
    /// <param name="rawText">The original post text; <c>null</c> is treated as empty.</param>
    /// <param name="parsed">The fields extracted from <paramref name="rawText"/>.</param>
    /// <returns>
    /// Validity, spam flag, a 0–100 confidence score and the list of detected issues.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="parsed"/> is <c>null</c>.</exception>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        ArgumentNullException.ThrowIfNull(parsed);

        var issues = new List<string>();
        var text = rawText ?? "";

        // Match markers on a normalized copy; length and link checks below use the original text.
        var haystack = NormalizeForMatching(text);

        bool looksMedical = ContainsAny(haystack, MedicalMarkers);
        // A spam word only counts as spam when nothing medical accompanies it.
        bool isSpam = !looksMedical && ContainsAny(haystack, SpamMarkers);

        if (isSpam)
        {
            issues.Add("به‌نظر اسپم/تبلیغاتی است");
        }

        if (!looksMedical)
        {
            issues.Add("نشانه‌ای از حوزه درمان یافت نشد");
        }

        int trimmedLength = text.Trim().Length;

        // «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role
        // and a contact number are what matter.
        int confidence = parsed.Kind == ListingKind.Talent
            ? ScoreTalent(parsed, looksMedical, trimmedLength, issues)
            : ScoreOpening(parsed, looksMedical, text, trimmedLength, issues);

        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && confidence >= MinValidConfidence;
        return new ValidationResult(isValid, isSpam, confidence, issues);
    }

    /// <summary>Scores a talent ("available for work") post; appends any issues found.</summary>
    private static int ScoreTalent(ParsedListing parsed, bool looksMedical, int trimmedLength, List<string> issues)
    {
        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 35;
        }
        else
        {
            issues.Add("نقش/رشته مشخص نیست");
        }

        if (parsed.Phone is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null || parsed.AreaNote is not null)
        {
            score += 15;
        }

        if (parsed.YearsExperience is not null || parsed.IsLicensed)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 10;
        }

        if (trimmedLength < 20)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        return Math.Clamp(score, 0, 100);
    }

    /// <summary>Scores a shift/job opening post; appends any issues found.</summary>
    private static int ScoreOpening(ParsedListing parsed, bool looksMedical, string text, int trimmedLength, List<string> issues)
    {
        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("نقش مشخص نیست");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null)
        {
            score += 20;
        }
        else
        {
            issues.Add("شهر/محل مشخص نیست");
        }

        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
        {
            score += 20;
        }
        else
        {
            issues.Add("اطلاعات پرداخت یافت نشد");
        }

        if (parsed.Phone is not null)
        {
            score += 15;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 5;
        }

        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
        if (trimmedLength < 25)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        if (trimmedLength > 1500)
        {
            score -= 10;
            issues.Add("متن غیرعادی بلند است");
        }

        if (LinkPattern.Matches(text).Count >= 3)
        {
            score -= 15;
            issues.Add("لینک‌های متعدد");
        }

        return Math.Clamp(score, 0, 100);
    }

    /// <summary>True when <paramref name="haystack"/> contains at least one of the markers (ordinal match).</summary>
    private static bool ContainsAny(string haystack, string[] markers) =>
        markers.Any(marker => haystack.Contains(marker, StringComparison.Ordinal));

    /// <summary>Normalizes every marker the same way the searched text is normalized.</summary>
    private static string[] NormalizeAll(params string[] markers) =>
        markers.Select(NormalizeForMatching).Distinct().ToArray();

    /// <summary>
    /// Folds common Persian spelling variants so ordinal substring matching is reliable:
    /// Arabic yeh / alef maksura → Persian yeh, Arabic kaf → Persian kaf, ZWNJ → space.
    /// Every replacement is one character for one character, so the length is unchanged.
    /// </summary>
    private static string NormalizeForMatching(string value) =>
        value
            .Replace('\u064A', '\u06CC')  // ي → ی
            .Replace('\u0649', '\u06CC')  // ى → ی
            .Replace('\u0643', '\u06A9')  // ك → ک
            .Replace('\u200C', ' ');      // zero-width non-joiner (half-space) → space
}