2026-06-03 08:18:19 +03:30
|
|
|
using System.Text.RegularExpressions;
|
|
|
|
|
using JobsMedical.Web.Models;
|
|
|
|
|
|
|
|
|
|
namespace JobsMedical.Web.Services.Scraping;
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Outcome of validating one scraped listing.
/// </summary>
/// <param name="IsValid">Whether the listing passed validation.</param>
/// <param name="IsSpam">Whether the listing was classified as spam/promotional.</param>
/// <param name="Confidence">Completeness score assigned to the listing.</param>
/// <param name="Issues">Human-readable problems found while validating.</param>
public record ValidationResult(
    bool IsValid,
    bool IsSpam,
    int Confidence,
    List<string> Issues);
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Scores a parsed listing for completeness and screens out spam. A listing must look like a
/// real medical shift/job (role + a location or pay signal, plausible length, contact) to pass.
/// The confidence drives whether it lands in the review queue (New), gets Flagged for a closer
/// look, or is auto-discarded as spam.
/// </summary>
/// <remarks>
/// Marker matching runs on a normalized copy of the text (Arabic yeh/kaf folded to their Persian
/// forms, ZWNJ treated as a space) so the same word is recognised regardless of which keyboard
/// or source produced it. Length and link checks always use the raw text.
/// </remarks>
public class ListingValidator
{
    // A listing needs at least this confidence (0-100) to be considered valid.
    private const int MinValidConfidence = 50;

    // Length sanity bounds, in characters of the trimmed raw text.
    private const int MinTalentLength = 20;
    private const int MinOpeningLength = 25;
    private const int MaxOpeningLength = 1500;

    // This many http(s) links or more in one post is treated as suspicious.
    private const int SuspiciousLinkCount = 3;

    // Compiled once instead of re-resolving the pattern on every Validate call.
    private static readonly Regex UrlPattern = new(@"https?://", RegexOptions.Compiled);

    // Posts that smell like ads/scams rather than medical shifts.
    private static readonly string[] SpamMarkers = NormalizeAll(
        "سرمایه گذاری", "سرمایهگذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور",
        "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین");

    // Clinical/health markers ONLY. Deliberately excludes generic words like «استخدام» and «شیفت»
    // (they match retail/restaurant ads on Divar). A post must contain a real care-domain term.
    private static readonly string[] MedicalMarkers = NormalizeAll(
        "درمانگاه", "بیمارستان", "کلینیک", "مطب", "اورژانس", "کادر درمان", "پلی کلینیک",
        "پزشک", "دکتر", "پرستار", "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر",
        "ماما", "مامایی", "تکنسین", "اتاق عمل", "بیهوشی", "رادیولوژی", "سونوگرافی",
        "آزمایشگاه", "تزریقات", "پانسمان", "فیزیوتراپ", "دندان", "داروخانه", "داروساز",
        "دیالیز", "فوریت", "آی سی یو", "سی سی یو", "آنکال", "کشیک", "تریاژ", "نوزادان", "سالمند");

    // Course/event/product ads aimed at clinicians — not job posts.
    private static readonly string[] PromoMarkers = NormalizeAll(
        "کارگاه", "وبینار", "سمینار", "همایش", "کنگره", "دوره آموزشی", "دورهی آموزشی",
        "ثبت نام", "ثبتنام", "ظرفیت محدود", "فروش دوره", "مدرک معتبر", "گواهی پایان دوره",
        "بوتاکس و فیلر", "مزوتراپی", "فیلر صورت");

    // Words that signal a real staffing post (hiring, shift, or availability).
    private static readonly string[] StaffingIntent = NormalizeAll(
        "استخدام", "جذب", "نیازمند", "نیازمندیم", "دعوت به همکاری", "شیفت", "آنکال", "انکال",
        "کشیک", "نوبت", "آماده به کار", "آمادهبهکار", "آماده همکاری", "جویای کار", "مسئول فنی");

    /// <summary>
    /// Validates a scraped post against its parsed fields and assigns a 0-100 confidence.
    /// </summary>
    /// <param name="rawText">Original post text; <c>null</c> is treated as empty.</param>
    /// <param name="parsed">Fields extracted from the post. Not read when the post is rejected as a promo ad.</param>
    /// <returns>
    /// Validity, spam flag, clamped confidence and the list of issues found. A promotional post
    /// without staffing intent is returned immediately as invalid spam with confidence 0.
    /// </returns>
    public ValidationResult Validate(string rawText, ParsedListing parsed)
    {
        var issues = new List<string>();
        var text = rawText ?? "";

        // Markers are matched against a normalized copy; scoring on length/links uses `text`.
        var haystack = NormalizeForMatching(text);

        // Scan the medical markers once; spam is "spam marker present and nothing medical".
        bool looksMedical = ContainsAny(haystack, MedicalMarkers);
        bool isSpam = !looksMedical && ContainsAny(haystack, SpamMarkers);

        if (isSpam)
        {
            issues.Add("بهنظر اسپم/تبلیغاتی است");
        }

        if (!looksMedical)
        {
            issues.Add("نشانهای از حوزه درمان یافت نشد");
        }

        // Promotional / training ads (workshops, webinars, course/product sales) are medical-
        // flavored but NOT staffing. Discard them when there's no hiring/shift/availability intent.
        bool isPromo = ContainsAny(haystack, PromoMarkers) && !ContainsAny(haystack, StaffingIntent);
        if (isPromo)
        {
            issues.Add("آگهی تبلیغاتی/آموزشی است، نه استخدام/شیفت");
            return new ValidationResult(false, true, 0, issues); // IsSpam → auto-discard
        }

        return parsed.Kind == ListingKind.Talent
            ? ScoreTalent(text, parsed, issues, looksMedical, isSpam)
            : ScoreOpening(text, parsed, issues, looksMedical, isSpam);
    }

    // «آماده به کار»: a worker offering themselves. No facility/shift-date expected; the role
    // and a contact number are what matter.
    private static ValidationResult ScoreTalent(
        string text, ParsedListing parsed, List<string> issues, bool looksMedical, bool isSpam)
    {
        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 35;
        }
        else
        {
            issues.Add("نقش/رشته مشخص نیست");
        }

        if (parsed.Phone is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null || parsed.AreaNote is not null)
        {
            score += 15;
        }

        if (parsed.YearsExperience is not null || parsed.IsLicensed)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 10;
        }

        if (text.Trim().Length < MinTalentLength)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        score = Math.Clamp(score, 0, 100);
        bool isValid = !isSpam && looksMedical && score >= MinValidConfidence;
        return new ValidationResult(isValid, isSpam, score, issues);
    }

    // Scores every non-talent listing: role, location, pay, contact, shift type, plus text sanity.
    private static ValidationResult ScoreOpening(
        string text, ParsedListing parsed, List<string> issues, bool looksMedical, bool isSpam)
    {
        int score = 0;

        if (parsed.RoleName is not null)
        {
            score += 30;
        }
        else
        {
            issues.Add("نقش مشخص نیست");
        }

        if (parsed.CityName is not null || parsed.DistrictName is not null)
        {
            score += 20;
        }
        else
        {
            issues.Add("شهر/محل مشخص نیست");
        }

        if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable)
        {
            score += 20;
        }
        else
        {
            issues.Add("اطلاعات پرداخت یافت نشد");
        }

        if (parsed.Phone is not null)
        {
            score += 15;
        }
        else
        {
            issues.Add("شماره تماس یافت نشد");
        }

        if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null)
        {
            score += 10;
        }

        if (looksMedical)
        {
            score += 5;
        }

        // Sanity on length — a few words isn't a real listing; a wall of text is suspicious.
        int length = text.Trim().Length;
        if (length < MinOpeningLength)
        {
            score -= 20;
            issues.Add("متن خیلی کوتاه است");
        }

        if (length > MaxOpeningLength)
        {
            score -= 10;
            issues.Add("متن غیرعادی بلند است");
        }

        if (UrlPattern.Matches(text).Count >= SuspiciousLinkCount)
        {
            score -= 15;
            issues.Add("لینکهای متعدد");
        }

        score = Math.Clamp(score, 0, 100);

        // Valid enough for the queue if it's medical, not spam, and reasonably complete.
        bool isValid = !isSpam && looksMedical && score >= MinValidConfidence;
        return new ValidationResult(isValid, isSpam, score, issues);
    }

    // True when the (already normalized) text contains at least one of the markers.
    private static bool ContainsAny(string normalizedText, string[] markers) =>
        markers.Any(marker => normalizedText.Contains(marker, StringComparison.Ordinal));

    // Applies the same normalization to marker literals that is applied to incoming text,
    // so both sides of every comparison use one canonical spelling.
    private static string[] NormalizeAll(params string[] markers) =>
        markers.Select(NormalizeForMatching).ToArray();

    // Folds spelling variants that look identical to a reader but differ in code points:
    // Arabic yeh/kaf become Persian yeh/kaf, and ZWNJ becomes a plain space.
    private static string NormalizeForMatching(string value) =>
        value
            .Replace('\u064A', '\u06CC')  // ARABIC LETTER YEH  -> ARABIC LETTER FARSI YEH
            .Replace('\u0643', '\u06A9')  // ARABIC LETTER KAF  -> ARABIC LETTER KEHEH
            .Replace('\u200C', ' ');      // ZERO WIDTH NON-JOINER -> space
}
|