// hamkadr/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs

using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site). It reads the
/// site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate every ad URL,
/// then fetches each ad page and extracts its title + description. The engine's content-hash
/// dedupe means each ad is only ever ingested once, so repeated runs pick up only new ads.
/// Published items become job pages on hamkadr.ir (the SEO goal).
/// </summary>
/// <remarks>
/// All parsing is regex-based and best-effort; no HTML/XML parser is used.
/// NOTE(review): extraction changes alter the text handed to the engine. If the dedupe hash is
/// computed over that text, previously ingested ads could be seen as new — confirm before deploy.
/// </remarks>
public class MedjobsListingSource : IListingSource
{
    private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";

    /// <summary>Upper bound applied to the configured "max ads per run" setting.</summary>
    private const int MaxAdsCeiling = 500;

    /// <summary>Extracted ads shorter than this many characters are discarded.</summary>
    private const int MinAdTextLength = 25;

    /// <summary>Extracted title + body text is truncated to this many characters.</summary>
    private const int MaxAdTextLength = 1800;

    // Same pattern as before, now built once instead of on every call.
    private static readonly Regex LocPattern = new(
        "<loc>([^<]+)</loc>",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // A whole <meta ...> tag; its attributes are then read individually so that attribute
    // order and quote style do not matter.
    private static readonly Regex MetaTagPattern = new(
        @"<meta\b[^>]*>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // name="value" or name='value'. Each alternative only stops at ITS OWN quote character, so
    // an apostrophe inside a double-quoted value no longer truncates it.
    private static readonly Regex AttributePattern = new(
        @"([\w:-]+)\s*=\s*(?:""([^""]*)""|'([^']*)')",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // Any opening or closing div tag; used to balance nesting in BetweenClass.
    private static readonly Regex DivTagPattern = new(
        @"</?div\b[^>]*>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<MedjobsListingSource> _log;

    public MedjobsListingSource(ScrapeHttpClients clients, ILogger<MedjobsListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    public string Name => "مدجابز (medjobs.ir)";

    /// <summary>
    /// Enumerates ad URLs from the site's ad_listing sitemaps and downloads each ad page,
    /// returning one <see cref="ScrapedItem"/> per ad whose extracted text is long enough.
    /// </summary>
    /// <param name="s">Settings; reads <c>MedjobsEnabled</c>, <c>MedjobsMaxAds</c> (clamped to
    /// 1..500) and <c>MedjobsUseProxy</c>.</param>
    /// <param name="ct">Cancels the run. Cancellation yields an empty list, not an exception.</param>
    /// <returns>
    /// The scraped ads. Empty when the source is disabled, when no ad sitemap is found, when the
    /// run is cancelled, or when the sitemap index cannot be fetched. A failure of a single
    /// sitemap or ad page is logged and skipped.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedjobsEnabled) return Array.Empty<ScrapedItem>();
        var max = Math.Clamp(s.MedjobsMaxAds, 1, MaxAdsCeiling);
        var client = _clients.For(s, s.MedjobsUseProxy);

        try
        {
            // 1. sitemap index → the ad_listing sitemaps
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var adSitemaps = Locs(index)
                .Where(u => u.Contains("ad_listing-sitemap", StringComparison.Ordinal))
                .ToList();
            if (adSitemaps.Count == 0)
            {
                _log.LogWarning("medjobs: no ad_listing sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs (skip the bare /ads/ archive)
            // NOTE(review): URLs are taken in sitemap order and capped at `max`. If the sitemaps
            // list the oldest ads first, ads beyond the cap are never reached — confirm ordering.
            var adUrls = new List<string>();
            foreach (var sm in adSitemaps)
            {
                if (adUrls.Count >= max) break;
                try
                {
                    var body = await client.GetStringAsync(sm, ct);
                    adUrls.AddRange(Locs(body).Where(IsAdUrl));
                }
                // The filter lets a real cancellation fall through to the handler below instead
                // of being logged once per sitemap. A timeout (token not cancelled) is still
                // treated as an ordinary per-sitemap failure.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm);
                }
            }
            adUrls = adUrls.Distinct().Take(max).ToList();

            // 3. fetch each ad page → title + description
            var items = new List<ScrapedItem>();
            foreach (var url in adUrls)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length >= MinAdTextLength) items.Add(new ScrapedItem("مدجابز", text, url));
                }
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medjobs: ad {Url} failed", url);
                }
            }
            _log.LogInformation("medjobs: fetched {Count} ads", items.Count);
            return items;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Keep the existing "never throws, returns empty" contract, but report cancellation
            // as what it is rather than as a failure with a stack trace.
            _log.LogInformation("medjobs: fetch cancelled");
            return Array.Empty<ScrapedItem>();
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "medjobs fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>True for an individual ad URL (contains "/ads/") but not the bare /ads archive.</summary>
    private static bool IsAdUrl(string url)
        // Ordinal: URLs are identifiers, so the comparison must not depend on the current culture.
        => url.Contains("/ads/", StringComparison.Ordinal)
           && !url.TrimEnd('/').EndsWith("/ads", StringComparison.Ordinal);

    /// <summary>
    /// Yields the trimmed text of every &lt;loc&gt; element in a sitemap document.
    /// Sitemap URLs are entity-escaped (e.g. <c>&amp;amp;</c>), so each value is decoded.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocPattern.Matches(xml)
            .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>Title (og:title, site suffix stripped) + body (entry/description content or og:description).</summary>
    /// <remarks>
    /// The combined text is converted to plain text and capped at 1800 characters. Phone numbers
    /// harvested from the full page are appended when none of them already appears in the text.
    /// </remarks>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Drop a trailing "| site name" suffix; a bar this early is assumed to be part of
            // the title itself and is left alone.
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        string? body = BetweenClass(html, "rtcl-description")
                    ?? BetweenClass(html, "entry-content")
                    ?? Meta(html, "og:description");

        var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
        var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
        if (text.Length > MaxAdTextLength)
        {
            // Never cut between the two halves of a surrogate pair (e.g. an emoji).
            var cut = char.IsHighSurrogate(text[MaxAdTextLength - 1]) ? MaxAdTextLength - 1 : MaxAdTextLength;
            text = text[..cut];
        }

        // The contact number is often outside the description (in a tel: link / data attribute the
        // page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
        var phones = HtmlUtil.HarvestPhones(html);
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);
        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first &lt;meta&gt; tag whose
    /// <c>property</c> attribute equals <paramref name="prop"/>, or null when there is none.
    /// Works for either attribute order and either quote style.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        foreach (Match tag in MetaTagPattern.Matches(html))
        {
            string? property = null;
            string? content = null;
            foreach (Match attr in AttributePattern.Matches(tag.Value))
            {
                var name = attr.Groups[1].Value;
                var value = attr.Groups[2].Success ? attr.Groups[2].Value : attr.Groups[3].Value;
                if (name.Equals("property", StringComparison.OrdinalIgnoreCase)) property ??= value;
                else if (name.Equals("content", StringComparison.OrdinalIgnoreCase)) content ??= value;
            }

            if (content is not null && string.Equals(property, prop, StringComparison.Ordinal))
                return System.Net.WebUtility.HtmlDecode(content);
        }
        return null;
    }

    /// <summary>
    /// Grab the inner HTML of the first &lt;div class="...name..."&gt; (best-effort).
    /// Nested &lt;div&gt; elements are balanced, so the result runs to the matching closing tag
    /// rather than to the first &lt;/div&gt; encountered.
    /// </summary>
    /// <returns>The inner HTML, or null when no such div (or no closing tag) is found.</returns>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(html, $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>");
        if (!open.Success) return null;

        var start = open.Index + open.Length;
        var depth = 1;
        var firstClose = -1;
        foreach (Match tag in DivTagPattern.Matches(html, start))
        {
            if (tag.Value[1] == '/')
            {
                if (firstClose < 0) firstClose = tag.Index;
                if (--depth == 0) return html[start..tag.Index];
            }
            else
            {
                depth++;
            }
        }

        // Unbalanced markup: fall back to the first closing tag, if there was one.
        return firstClose >= 0 ? html[start..firstClose] : null;
    }
}