hamkadr/src/JobsMedical.Web/Services/ListingParser.cs

using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services;

/// <summary>One contact channel pulled from a post (type + raw value).</summary>
/// <param name="Type">Which kind of channel this is (mobile, landline, e-mail, a social network, website…).</param>
/// <param name="Value">The extracted value as text: a phone number, an e-mail address, a handle or a URL.</param>
public record ParsedContact(ContactType Type, string Value);

/// <summary>
/// Structured guess extracted from a raw channel post. All fields are best-effort: anything the
/// parser could not detect keeps its default (null, false, empty list, or the enum default shown
/// on the property).
/// </summary>
public class ParsedListing
{
    public ListingKind Kind { get; set; } = ListingKind.Shift;
    public string? RoleName { get; set; }                 // primary role (first match)
    public List<string> RoleNames { get; set; } = new();  // all roles named in the ad (one ad can ask for several)
    public ShiftType? ShiftType { get; set; }
    public EmploymentType? EmploymentType { get; set; }
    public long? PayAmount { get; set; }        // shift pay or single salary figure
    public int? SharePercent { get; set; }      // profit-share percentage, when the ad pays by revenue share
    public bool PayNegotiable { get; set; }
    public Gender Gender { get; set; } = Gender.Any;   // required gender, if the ad states one
    public string? CityName { get; set; }
    public string? DistrictName { get; set; }
    public string? FacilityName { get; set; }   // hospital/clinic name guessed from the text
    public string? Phone { get; set; }

    // Talent ("available for work") extras — populated when Kind == Talent.
    public string? PersonName { get; set; }     // title plus name, e.g. "Dr. <first name> <surname>"
    public int? YearsExperience { get; set; }   // years of experience
    public bool IsLicensed { get; set; }        // holds a practice licence
    public string? AreaNote { get; set; }       // area restriction mentioned in the post, e.g. "district 1"
    public List<ParsedContact> Contacts { get; set; } = new();  // phones, email, socials…
    public List<string> Tags { get; set; } = new();             // cert/skill keywords for search
    public List<string> Notes { get; set; } = new();  // what was/wasn't detected (shown to admin)
}

/// <summary>
/// Turns a messy Persian channel/Divar post into a structured listing guess. This is the
/// Stage-1 implementation: transparent keyword + regex heuristics, no AI dependency (important
/// since LLM APIs are blocked from Iran). A future LlmListingParser can implement the same
/// interface and be swapped in via DI without touching the admin queue.
/// </summary>
public interface IListingParser
{
    /// <summary>Parses one raw post into a best-effort <see cref="ParsedListing"/>.</summary>
    /// <param name="rawText">The post text exactly as it was collected.</param>
    /// <param name="knownRoles">Role names from the taxonomy that the post is matched against.</param>
    /// <param name="knownCities">City names that the post is matched against.</param>
    /// <param name="knownDistricts">District names that the post is matched against.</param>
    /// <returns>The structured guess; fields that could not be detected keep their defaults.</returns>
    ParsedListing Parse(string rawText, IEnumerable<string> knownRoles,
                        IEnumerable<string> knownCities, IEnumerable<string> knownDistricts);
}

public class HeuristicListingParser : IListingParser
{
    /// <summary>
    /// Builds a best-effort <see cref="ParsedListing"/> from one raw post using keyword and regex
    /// heuristics. Every detection step also appends a human-readable line to
    /// <see cref="ParsedListing.Notes"/> describing what was (or was not) recognised.
    /// </summary>
    /// <param name="raw">The post text as collected; <c>null</c> is treated as an empty post.</param>
    /// <param name="knownRoles">Taxonomy role names; matched verbatim and used to validate synonym hits.</param>
    /// <param name="knownCities">City names; the first one found in the text becomes <see cref="ParsedListing.CityName"/>.</param>
    /// <param name="knownDistricts">District names; the longest one found in the text becomes <see cref="ParsedListing.DistrictName"/>.</param>
    /// <returns>A new <see cref="ParsedListing"/>; undetected fields keep their defaults.</returns>
    public ParsedListing Parse(string raw, IEnumerable<string> knownRoles,
        IEnumerable<string> knownCities, IEnumerable<string> knownDistricts)
    {
        // Treat a missing post as empty so that every stage below (including Normalize) can run.
        raw ??= string.Empty;

        var p = new ParsedListing();
        var text = Normalize(raw);

        // --- Kind: talent (worker offers themselves) vs shift vs hiring ---
        // Talent is checked first: "ready to work / looking for work" phrases mean the *person* is
        // available — distinct from an employer's invitation to cooperate.
        // Normalize turns the zero-width non-joiner into a space, so phrases commonly written with
        // one are listed in their spaced form too.
        bool talentSignals = ContainsAny(text,
            "آماده به کار", "آماده‌به‌کار", "آماده همکاری", "آماده‌ی همکاری", "آماده ی همکاری",
            "آماده فعالیت", "جویای کار", "جویای کار هستم", "متقاضی کار", "نیازمند کار",
            "آماده انجام", "می‌توانم همکاری", "می توانم همکاری", "میتوانم همکاری", "حاضر به همکاری");
        bool jobSignals = ContainsAny(text, "استخدام", "جذب", "دعوت به همکاری", "نیازمندیم", "نیازمند است", "حقوق ثابت");
        bool shiftSignals = ContainsAny(text, "شیفت", "آنکال", "انکال", "نوبت", "کشیک");
        if (talentSignals)
        {
            p.Kind = ListingKind.Talent;
            p.Notes.Add("نوع: آماده به کار (تشخیص خودکار)");
        }
        else
        {
            // Hiring only when hiring words appear without any shift word; otherwise default to shift.
            p.Kind = (jobSignals && !shiftSignals) ? ListingKind.Job : ListingKind.Shift;
            p.Notes.Add(p.Kind == ListingKind.Job ? "نوع: استخدام (تشخیص خودکار)" : "نوع: شیفت (تشخیص خودکار)");
        }

        // --- Roles (an ad can name several at once) ---
        var known = knownRoles.ToList();
        var hits = new List<string>();
        // Exact taxonomy matches (longest first so a two-word role beats the single word it contains).
        foreach (var role in known.OrderByDescending(r => r.Length))
            if (text.Contains(Normalize(role))) hits.Add(role);
        // Drop a role that's a substring of a longer matched role.
        hits = hits.Where(r => !hits.Any(o => o != r && o.Contains(r))).Distinct().ToList();

        // Synonyms → canonical role names (covers terms not written verbatim). Only add a canonical
        // that actually exists in the taxonomy, and isn't already a hit.
        void AddSyn(string canonical, params string[] needles)
        {
            if (ContainsAny(text, needles) && known.Contains(canonical) && !hits.Contains(canonical))
                hits.Add(canonical);
        }
        AddSyn("پرستار سالمندان", "سالمند", "سالمندان", "نگهداری سالمند");
        AddSyn("دندانپزشک", "دندان", "دندانپزشک", "دندان‌پزشک");
        AddSyn("تکنسین اتاق عمل", "اتاق عمل", "اسکراب");
        AddSyn("تکنسین فوریت‌های پزشکی", "فوریت", "اورژانس پیش بیمارستانی", "آمبولانس");
        AddSyn("کارشناس آزمایشگاه", "آزمایشگاه", "علوم آزمایشگاهی", "نمونه گیر");
        AddSyn("ماما", "مامایی");
        AddSyn("پرستار", "بهیار", "کمک بهیار", "کمک پرستار", "بیماربر", "مراقب", "همراه بیمار",
                          "کودک", "اطفال", "نوزاد", "تزریقات", "پانسمان");
        AddSyn("پزشک متخصص", "فوق تخصص", "متخصص");
        AddSyn("پزشک عمومی", "پزشک", "دکتر", "طبیب");

        p.RoleNames = hits.Distinct().Take(4).ToList();    // cap fan-out
        p.RoleName = p.RoleNames.FirstOrDefault();
        p.Notes.Add(p.RoleNames.Count == 0 ? "نقش: تشخیص داده نشد" : $"نقش‌ها: {string.Join("، ", p.RoleNames)}");

        // --- Shift type (first matching cue wins: on-call, night, evening, day) ---
        if (ContainsAny(text, "آنکال", "انکال")) p.ShiftType = Models.ShiftType.OnCall;
        else if (text.Contains("شب")) p.ShiftType = Models.ShiftType.Night;
        else if (text.Contains("عصر")) p.ShiftType = Models.ShiftType.Evening;
        else if (ContainsAny(text, "صبح", "روز")) p.ShiftType = Models.ShiftType.Day;

        // --- Employment type (first matching cue wins) ---
        if (ContainsAny(text, "پاره وقت", "پاره‌وقت", "پارت تایم")) p.EmploymentType = Models.EmploymentType.PartTime;
        else if (text.Contains("طرح")) p.EmploymentType = Models.EmploymentType.Plan;
        else if (text.Contains("قرارداد")) p.EmploymentType = Models.EmploymentType.Contract;
        else if (ContainsAny(text, "تمام وقت", "تمام‌وقت")) p.EmploymentType = Models.EmploymentType.FullTime;

        // --- Gender requirement (female cues are checked before male cues) ---
        if (ContainsAny(text, "خانم", "خانوم", "بانو", "زن ", "مامای")) p.Gender = Gender.Female;
        else if (ContainsAny(text, "آقا", "اقا", "مرد ", "مرد،", "پسر")) p.Gender = Gender.Male;
        if (p.Gender != Gender.Any)
            p.Notes.Add($"جنسیت: {(p.Gender == Gender.Female ? "خانم" : "آقا")}");

        // --- City / district ---
        p.CityName = knownCities.FirstOrDefault(c => text.Contains(Normalize(c)));
        p.DistrictName = knownDistricts.OrderByDescending(d => d.Length)
            .FirstOrDefault(d => text.Contains(Normalize(d)));

        // --- Profit share: a number next to a percent sign/word, in either order ---
        var latinForShare = ToLatinDigits(text);
        var share = Regex.Match(latinForShare, @"(\d{1,3})\s*(?:٪|%|درصد)");
        if (!share.Success) share = Regex.Match(latinForShare, @"(?:٪|%)\s*(\d{1,3})");
        if (share.Success && int.TryParse(share.Groups[1].Value, out var pct) && pct is > 0 and <= 100)
        { p.SharePercent = pct; p.Notes.Add($"سهم درآمد: {pct}٪"); }
        else if (ContainsAny(text, "درصدی", "سهم درآمد", "شراکت", "پورسانت"))
        { p.Notes.Add("پرداخت درصدی/سهمی (درصد نامشخص)"); }

        // --- Fixed pay (strip phone numbers first so they're never read as money) ---
        if (ContainsAny(text, "توافقی", "توافق")) { p.PayNegotiable = true; p.Notes.Add("حقوق: توافقی"); }
        else
        {
            var amount = ExtractAmount(StripPhones(text));
            if (amount is not null) { p.PayAmount = amount; p.Notes.Add($"حقوق تخمینی: {amount:#,0} تومان"); }
            else if (p.SharePercent is null) p.Notes.Add("حقوق: تشخیص داده نشد");
        }

        // --- Talent extras (only meaningful for "available for work" posts) ---
        if (p.Kind == ListingKind.Talent)
        {
            var latinT = ToLatinDigits(text);
            // "experience … N years" or "N years experience"; accepted range is 1–60 years.
            var exp = Regex.Match(latinT, @"سابقه[^\d]{0,8}(\d{1,2})\s*سال");
            if (!exp.Success) exp = Regex.Match(latinT, @"(\d{1,2})\s*سال\s*سابقه");
            if (exp.Success && int.TryParse(exp.Groups[1].Value, out var yrs) && yrs is > 0 and <= 60)
            { p.YearsExperience = yrs; p.Notes.Add($"سابقه: {yrs} سال"); }

            p.IsLicensed = ContainsAny(text, "پروانه دار", "پروانه‌دار", "دارای پروانه", "پروانه فعالیت", "پروانه طبابت");
            if (p.IsLicensed) p.Notes.Add("پروانه‌دار");

            p.PersonName = ExtractPersonName(text);
            if (p.PersonName is not null) p.Notes.Add($"نام: {p.PersonName}");

            // "district N" style restriction, kept verbatim as a free-text note.
            var area = Regex.Match(text, @"منطقه\s*[۰-۹0-9]{1,2}");
            if (area.Success) { p.AreaNote = area.Value.Trim(); p.Notes.Add($"محدوده: {p.AreaNote}"); }
        }

        // --- Facility name (facility keyword + the distinctive name); skipped for talent posts ---
        if (p.Kind != ListingKind.Talent)
        {
            p.FacilityName = ExtractFacilityName(text);
            if (p.FacilityName is not null) p.Notes.Add($"مرکز: {p.FacilityName}");
        }

        // --- Tags (certs/skills for deep search), plus the detected roles ---
        p.Tags = ExtractTags(text);
        if (p.RoleNames.Count > 0) p.Tags.AddRange(p.RoleNames);
        if (p.IsLicensed && !p.Tags.Contains("پروانه‌دار")) p.Tags.Add("پروانه‌دار");
        p.Tags = p.Tags.Distinct().ToList();

        // --- Contacts (phones, email, socials — one ad may have several) ---
        // Uses the un-normalized post so handles and URLs are read exactly as written.
        p.Contacts = ExtractContacts(raw);
        p.Phone = p.Contacts.FirstOrDefault(c => c.Type is ContactType.Mobile or ContactType.Phone)?.Value;
        if (p.Contacts.Count > 0)
            p.Notes.Add("راه‌های ارتباطی: " + string.Join("، ", p.Contacts.Select(c => ContactLabel(c.Type))));

        return p;
    }

    // Words that introduce a facility name. ExtractFacilityName tries them in array order and
    // returns the first keyword that yields a usable name, so a specific phrase must stay ahead
    // of the generic word it contains (the two-word "centre"/"complex" phrases come before the
    // bare words).
    private static readonly string[] FacilityKeywords =
    {
        "بیمارستان", "زایشگاه", "پلی کلینیک", "پلیکلینیک", "درمانگاه", "کلینیک",
        "مرکز درمانی", "مرکز جراحی", "مجتمع پزشکی", "مجتمع درمانی", "مرکز", "مجتمع",
        "آزمایشگاه", "مطب", "خانه سالمندان", "سرای سالمندان",
    };

    // Words that clearly aren't part of a name (prepositions, hiring/shift vocabulary, fillers) —
    // name collection stops at the first one. Used by both ExtractFacilityName and
    // ExtractPersonName; compared as whole words, exact match.
    private static readonly string[] NameStops =
    {
        "جهت", "برای", "به", "با", "در", "از", "که", "نیاز", "نیازمند", "استخدام", "جذب",
        "دعوت", "همکاری", "واقع", "آدرس", "تلفن", "شماره", "شیفت", "ساعت", "حقوق", "روز",
        "شب", "صبح", "عصر", "می", "ها", "این", "یک", "محترم",
    };

    /// <summary>
    /// Guesses a hospital/clinic name: the first facility keyword (in <c>FacilityKeywords</c>
    /// order) that is followed by one to three acceptable name words.
    /// </summary>
    /// <param name="text">Normalized post text.</param>
    /// <returns>"keyword name…" or <c>null</c> when no keyword produced a usable, non-junk name.</returns>
    private static string? ExtractFacilityName(string text)
    {
        char[] separators = { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' };

        foreach (var keyword in FacilityKeywords)
        {
            var start = text.IndexOf(keyword, StringComparison.Ordinal);
            if (start < 0)
                continue;

            // Only the text after the first occurrence of the keyword is inspected.
            var tail = text.Substring(start + keyword.Length);
            var nameWords = new List<string>();
            foreach (var token in tail.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                // A token ends the name when it is a stop word, carries a digit (numbers/phones),
                // has no letter at all (emoji/punctuation), or is a stray single character.
                var acceptable = !NameStops.Contains(token)
                                 && !Regex.IsMatch(token, @"\d")
                                 && token.Any(char.IsLetter)
                                 && token.Length != 1;
                if (!acceptable)
                    break;

                nameWords.Add(token);
                if (nameWords.Count >= 3)
                    break;
            }

            // A bare keyword with nothing after it isn't useful; try the next keyword.
            if (nameWords.Count == 0)
                continue;

            var candidate = $"{keyword} {string.Join(" ", nameWords)}".Trim();

            // Names made only of filler/verb/source noise are rejected so the caller falls back
            // to its placeholder; otherwise this is the answer.
            if (!Scraping.FacilityMatcher.IsJunkName(candidate))
                return candidate;
        }

        return null;
    }

    // Titles that introduce a person's name in talent ("available for work") posts.
    // ExtractPersonName tries them in array order and uses the first title that is found in the
    // text and followed by at least one acceptable name word.
    private static readonly string[] PersonTitles = { "دکتر", "خانم دکتر", "آقای دکتر", "مهندس", "سرکار خانم", "جناب آقای", "خانم", "آقای" };

    /// <summary>
    /// Best-effort person name: the first title from <c>PersonTitles</c> found in the text plus
    /// up to two following name words.
    /// </summary>
    /// <param name="text">Normalized post text.</param>
    /// <returns>"title name…" or <c>null</c> when no title is followed by a usable word.</returns>
    private static string? ExtractPersonName(string text)
    {
        foreach (var title in PersonTitles)
        {
            var idx = text.IndexOf(title, StringComparison.Ordinal);
            if (idx < 0) continue;
            var after = text[(idx + title.Length)..];
            // Same separators as the facility-name extractor, so quotation marks around a name
            // («…» or "…") are not glued onto its first/last word.
            var words = after.Split(
                new[] { ' ', '\n', '\r', '\t', '،', ',', '.', '؛', ':', '(', ')', '-', '/', '«', '»', '"' },
                StringSplitOptions.RemoveEmptyEntries);
            var picked = new List<string>();
            foreach (var w in words)
            {
                if (NameStops.Contains(w)) break;
                if (Regex.IsMatch(w, @"[\d]")) break;      // numbers/phones aren't names
                if (!w.Any(char.IsLetter)) break;          // emoji / punctuation isn't a name (an emoji is 2 chars, so the length check misses it)
                if (w.Length == 1) break;                  // stray letters
                picked.Add(w);
                if (picked.Count >= 2) break;
            }
            if (picked.Count == 0) continue;               // title with no name after it: try the next title
            return (title + " " + string.Join(" ", picked)).Trim();
        }
        return null;
    }

    /// <summary>
    /// Blanks out phone numbers so that a later money scan cannot mistake them for an amount.
    /// Three passes: the rest of any line that starts a "contact number" phrase, then mobile
    /// numbers, then landline numbers. Digits in the result are Latin.
    /// </summary>
    /// <param name="text">Post text (Persian/Arabic digits allowed).</param>
    /// <returns>The text with each removed span replaced by a single space.</returns>
    private static string StripPhones(string text)
    {
        const string contactLinePattern = @"شماره\s*(?:تماس|موبایل|همراه|ثابت|تلفن)[^\n]*";
        const string mobilePattern = @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)";
        const string landlinePattern = @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)";

        // The contact-line pass runs before digit conversion, exactly like the number passes run after it.
        var withoutContactLines = Regex.Replace(text, contactLinePattern, " ");
        var latin = ToLatinDigits(withoutContactLines);
        var withoutMobiles = Regex.Replace(latin, mobilePattern, " ");
        return Regex.Replace(withoutMobiles, landlinePattern, " ");
    }

    /// <summary>
    /// Pulls a money figure out of free text and normalizes it to toman (rial → toman = ÷10),
    /// handling the "million" word and Persian/Arabic digits.
    /// </summary>
    /// <param name="text">Post text, ideally with phone numbers already stripped.</param>
    /// <returns>
    /// The amount in toman: the first "N million" figure if there is one, otherwise the largest
    /// plain number that looks like money; <c>null</c> when nothing qualifies.
    /// </returns>
    private static long? ExtractAmount(string text)
    {
        // Upper bound for the "N million" form; keeps the decimal → long conversion from overflowing
        // on absurd digit runs (which then fall through to the plain-number scan).
        const decimal maxMillions = 1_000_000m;

        var latin = ToLatinDigits(text);
        bool hasToman = latin.Contains("تومان") || latin.Contains("تومن");
        bool hasRial = (latin.Contains("ریال") || latin.Contains("ريال")) && !hasToman;

        // e.g. "2 million" / "2.5 million [rial]". Besides '.' and ',', Persian text writes the
        // decimal mark as '/' or U+066B, so "2/5 million" means 2.5 million, not 5 million.
        var million = Regex.Match(latin, @"(\d+(?:[.,/\u066B]\d+)?)\s*میلیون\s*(ریال|ريال)?");
        if (million.Success)
        {
            var figure = million.Groups[1].Value.Replace(',', '.').Replace('/', '.').Replace('\u066B', '.');
            // decimal, not double: (long)(8.2 * 1_000_000) in binary floating point truncates to 8_199_999.
            if (decimal.TryParse(figure, System.Globalization.NumberStyles.AllowDecimalPoint,
                    System.Globalization.CultureInfo.InvariantCulture, out var m) && m <= maxMillions)
            {
                var val = (long)(m * 1_000_000m);
                if (million.Groups[2].Success) val /= 10;     // "million rial"
                return val;
            }
        }

        // Largest plain number that looks like money (6–10 digits, no leading zero — a leading
        // zero or 11+ digits means it's a phone/id). Convert rial → toman by the unit next to the
        // number, else by the ad's overall currency.
        long best = 0;
        foreach (Match num in Regex.Matches(latin, @"(?<!\d)([1-9][\d٬,،.]{4,})\s*(ریال|ريال|تومان|تومن)?"))
        {
            var digits = Regex.Replace(num.Groups[1].Value, @"[^\d]", "");   // drop grouping separators
            if (digits.Length is < 6 or > 10 || !long.TryParse(digits, out var v)) continue;
            var unit = num.Groups[2].Value;
            bool isRial = unit is "ریال" or "ريال" || (unit.Length == 0 && hasRial);
            if (isRial) v /= 10;
            if (v > best) best = v;
        }

        // Sanity: a monthly figure of 200M+ toman is implausible in Iran — if the ad never said
        // "toman", it was almost certainly rial, so normalize.
        if (best >= 200_000_000 && !hasToman) best /= 10;
        return best > 0 ? best : null;
    }

    // E-mail address: local part, '@', a domain, a dot, and a top-level part of at least two Latin letters.
    private static readonly Regex EmailRx = new(@"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", RegexOptions.Compiled);
    // http:// or https:// URL, running up to the next whitespace character.
    private static readonly Regex UrlRx = new(@"https?://[^\s]+", RegexOptions.Compiled);

    /// <summary>Display label for a contact type, delegated to <c>ContactInfo.Label</c>.</summary>
    private static string ContactLabel(ContactType t)
    {
        return ContactInfo.Label(t);
    }

    /// <summary>
    /// Pulls every contact channel out of a post: e-mail addresses, social links (Instagram /
    /// Telegram / Bale / WhatsApp) and other websites from URLs, handles that follow a Persian
    /// keyword cue, and finally mobile and landline numbers.
    /// </summary>
    /// <param name="raw">The post text (Persian/Arabic digits allowed).</param>
    /// <returns>Up to eight distinct contacts, in the order they were detected.</returns>
    private static List<ParsedContact> ExtractContacts(string raw)
    {
        var latin = ToLatinDigits(raw);
        var list = new List<ParsedContact>();

        // Cleans surrounding punctuation and skips values that are too short or already present
        // for the same contact type (case-insensitive).
        void Add(ContactType t, string v)
        {
            v = v.Trim().Trim('.', '،', ',', ')', '(', ':', '«', '»', '"', '/').Trim();
            if (v.Length < 2) return;
            if (!list.Any(c => c.Type == t && string.Equals(c.Value, v, StringComparison.OrdinalIgnoreCase)))
                list.Add(new ParsedContact(t, v));
        }

        // True when the host is one of the given domains or a sub-domain of one. Comparing the
        // host (not the whole URL text) keeps an unrelated site whose address merely contains
        // "t.me" or "ble.ir" from being filed as a social account.
        static bool HostIs(string host, params string[] domains) =>
            domains.Any(d => host.Equals(d, StringComparison.OrdinalIgnoreCase)
                             || host.EndsWith("." + d, StringComparison.OrdinalIgnoreCase));

        foreach (Match m in EmailRx.Matches(latin)) Add(ContactType.Email, m.Value);

        foreach (Match m in UrlRx.Matches(latin))
        {
            var u = m.Value.TrimEnd('.', '،', ')', '(', '"');
            // A URL that cannot be parsed gets an empty host and is therefore kept as a plain website.
            var host = Uri.TryCreate(u, UriKind.Absolute, out var parsed) ? parsed.Host : string.Empty;
            if (HostIs(host, "instagram.com", "instagr.am")) Add(ContactType.Instagram, UrlHandle(u));
            else if (HostIs(host, "t.me", "telegram.me")) Add(ContactType.Telegram, UrlHandle(u));
            else if (HostIs(host, "ble.ir", "bale.ai")) Add(ContactType.Bale, UrlHandle(u));
            else if (HostIs(host, "wa.me", "whatsapp.com")) Add(ContactType.WhatsApp, UrlHandle(u));
            else Add(ContactType.Website, u);
        }

        // Persian keyword → handle (latin handles only, so Persian words after the cue don't match).
        void Keyed(ContactType t, params string[] kws)
        {
            foreach (var kw in kws)
                foreach (Match m in Regex.Matches(latin, kw + @"\s*[:：]?\s*@?([A-Za-z0-9_.]{3,30})"))
                    Add(t, m.Groups[1].Value);
        }
        Keyed(ContactType.Instagram, "اینستاگرام", "اینستگرام", "اینستا", "پیج");
        Keyed(ContactType.Telegram, "تلگرام");
        Keyed(ContactType.WhatsApp, "واتساپ", "واتس اپ");

        // Phones — mobiles then landlines (multiple), boundary-guarded. Mobiles are stored in the
        // national 11-digit form starting with 09.
        foreach (Match m in Regex.Matches(latin, @"(?<!\d)(?:\+?98|0)?9\d{9}(?!\d)"))
        {
            var d = Regex.Replace(m.Value, @"\D", "");
            if (d.StartsWith("98")) d = "0" + d[2..];
            if (d.Length == 10 && d[0] == '9') d = "0" + d;
            Add(ContactType.Mobile, d);
        }
        foreach (Match m in Regex.Matches(latin, @"(?<!\d)0\d{2,3}[\s-]?\d{7,8}(?!\d)"))
        {
            var d = Regex.Replace(m.Value, @"\D", "");
            // The landline shape (0 + 2–3 digits + 7–8 digits) also fits an 11-digit 09… mobile.
            // File those as Mobile: Add then de-duplicates a number the mobile pass already found,
            // and a mobile written as "0912-1234567" still gets the right type.
            var isMobile = d.Length == 11 && d.StartsWith("09", StringComparison.Ordinal);
            Add(isMobile ? ContactType.Mobile : ContactType.Phone, d);
        }

        return list.Take(8).ToList();
    }

    // Canonical tag → trigger words found in the post. ExtractTags adds the tag when any one of
    // its needles occurs in the text; tags are emitted in the order listed here.
    private static readonly (string Tag, string[] Needles)[] TagDict =
    {
        ("mmt", new[] { "mmt", "ام ام تی", "ام‌ام‌تی" }),
        ("ICU", new[] { "icu", "آی سی یو", "آی‌سی‌یو" }),
        ("CCU", new[] { "ccu", "سی سی یو", "سی‌سی‌یو" }),
        ("NICU", new[] { "nicu", "ان آی سی یو", "نوزادان" }),
        ("BLS", new[] { "bls" }),
        ("ACLS", new[] { "acls" }),
        ("دیالیز", new[] { "دیالیز" }),
        ("اتاق عمل", new[] { "اتاق عمل", "اسکراب" }),
        ("بیهوشی", new[] { "بیهوشی" }),
        ("تریاژ", new[] { "تریاژ" }),
        ("تزریقات", new[] { "تزریقات", "تزریق" }),
        ("پانسمان", new[] { "پانسمان", "زخم" }),
        ("سونوگرافی", new[] { "سونوگرافی" }),
        ("رادیولوژی", new[] { "رادیولوژی" }),
        ("اورژانس", new[] { "اورژانس", "فوریت" }),
        ("مسئول فنی", new[] { "مسئول فنی" }),
        ("طرح", new[] { "طرح" }),
        ("سالمند", new[] { "سالمند" }),
        ("کودک", new[] { "کودک", "اطفال" }),
        ("همراه بیمار", new[] { "همراه بیمار" }),
        ("پروانه‌دار", new[] { "پروانه" }),
    };

    /// <summary>
    /// Collects the certificate/skill tags whose trigger words occur in the post.
    /// </summary>
    /// <param name="text">Normalized post text.</param>
    /// <returns>Canonical tags from <c>TagDict</c>, in table order; empty when nothing matches.</returns>
    private static List<string> ExtractTags(string text)
    {
        var tags = new List<string>();
        foreach (var (tag, needles) in TagDict)
        {
            // Case-insensitive: the Latin needles are stored lowercase ("icu", "bls"…) while posts
            // usually write these acronyms in capitals. Persian needles are unaffected.
            if (needles.Any(n => text.Contains(n, StringComparison.OrdinalIgnoreCase)))
                tags.Add(tag);
        }
        return tags;
    }

    /// <summary>
    /// Reduces a profile URL to its handle: the last path segment, ignoring any query string
    /// and trailing slashes.
    /// </summary>
    /// <param name="url">The URL text.</param>
    /// <returns>The last segment, or <paramref name="url"/> itself when that segment is blank.</returns>
    private static string UrlHandle(string url)
    {
        // Everything from the first '?' onwards is the query string and is not part of the handle.
        var queryStart = url.IndexOf('?');
        var withoutQuery = queryStart >= 0 ? url.Substring(0, queryStart) : url;
        var path = withoutQuery.TrimEnd('/');

        var lastSlash = path.LastIndexOf('/');
        var handle = lastSlash >= 0 ? path.Substring(lastSlash + 1) : path;

        if (string.IsNullOrWhiteSpace(handle))
        {
            return url;
        }
        return handle;
    }

    /// <summary>
    /// Canonical form used for all keyword matching: Arabic yeh and kaf become their Persian
    /// forms, the zero-width non-joiner becomes a plain space, and surrounding whitespace is trimmed.
    /// </summary>
    /// <param name="s">Text to normalize.</param>
    /// <returns>The normalized text.</returns>
    private static string Normalize(string s)
    {
        var persianLetters = s.Replace('ي', 'ی').Replace('ك', 'ک');
        // U+200C (zero-width non-joiner) is invisible in source, so it is written as an escape here.
        var spaced = persianLetters.Replace('\u200C', ' ');
        return spaced.Trim();
    }

    /// <summary>
    /// True when <paramref name="text"/> contains at least one of the needles (ordinal match).
    /// </summary>
    /// <remarks>
    /// The text passed in has been through <c>Normalize</c>, which turns the zero-width non-joiner
    /// into a space and Arabic yeh/kaf into their Persian forms. A needle written with any of those
    /// characters could therefore never match as-is, so each needle is also tried in the same folded
    /// form. The needle is deliberately not trimmed: some needles rely on a trailing space to match
    /// whole words only.
    /// </remarks>
    /// <param name="text">Text to search.</param>
    /// <param name="needles">Candidate substrings; an empty list yields <c>false</c>.</param>
    private static bool ContainsAny(string text, params string[] needles)
    {
        foreach (var needle in needles)
        {
            if (text.Contains(needle, StringComparison.Ordinal)) return true;

            var folded = needle.Replace('ي', 'ی').Replace('ك', 'ک').Replace('\u200C', ' ');
            if (folded != needle && text.Contains(folded, StringComparison.Ordinal)) return true;
        }
        return false;
    }

    /// <summary>
    /// Replaces Persian (U+06F0–U+06F9) and Arabic-Indic (U+0660–U+0669) digits with their
    /// ASCII equivalents; every other character is left untouched.
    /// </summary>
    /// <param name="s">Text that may contain non-Latin digits.</param>
    /// <returns>A string of the same length with all such digits converted to '0'–'9'.</returns>
    private static string ToLatinDigits(string s)
    {
        static char ToLatin(char c) => c switch
        {
            >= '\u06F0' and <= '\u06F9' => (char)('0' + (c - '\u06F0')),   // Persian digits
            >= '\u0660' and <= '\u0669' => (char)('0' + (c - '\u0660')),   // Arabic-Indic digits
            _ => c,
        };

        return new string(s.Select(ToLatin).ToArray());
    }
}