src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs

using System.Text.RegularExpressions;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site). It reads the
/// site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate every ad URL,
/// then fetches each ad page and extracts its title + description. The engine's content-hash
/// dedupe means each ad is only ever ingested once, so repeated runs pick up only new ads.
/// Published items become job pages on hamkadr.ir (the SEO goal).
/// </summary>
/// <summary>
/// Listing source for medjobs.ir (a WordPress "ad_listing" classifieds site).
/// <para>
/// Pipeline: download the sitemap index, keep the <c>ad_listing-sitemap</c> entries, collect ad URLs
/// from them (at most <c>MedjobsMaxAds</c>, clamped to 1..500), then download each ad page and reduce
/// it to "title + description" plain text.
/// </para>
/// <para>
/// This class does no de-duplication across runs; per the original design note that is left to the
/// engine's content-hash dedupe. NOTE(review): URLs are taken in sitemap order, so when the site has
/// more ads than the configured maximum the same leading URLs are fetched on every run — confirm the
/// sitemap ordering suits the "pick up new ads" goal.
/// </para>
/// </summary>
public class MedjobsListingSource : IListingSource
{
    private const string Host = "medjobs.ir";
    private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";
    private const string SourceLabel = "مدجابز";

    /// <summary>Extracted texts shorter than this are discarded.</summary>
    private const int MinTextLength = 25;

    /// <summary>Extracted texts are cut to at most this many UTF-16 code units.</summary>
    private const int MaxTextLength = 1800;

    private const RegexOptions HtmlRx = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    /// <summary>A <c>content</c> attribute whose value ends at the SAME quote character that opened it.</summary>
    private const string ContentAttr = "\\scontent\\s*=\\s*(?:\"(?<v>[^\"]*)\"|'(?<v>[^']*)')";

    /// <summary>Sitemap <c>&lt;loc&gt;</c> element, either plain text (group <c>t</c>) or CDATA-wrapped (group <c>c</c>).</summary>
    private static readonly Regex LocRx = new(
        "<loc>\\s*(?:<!\\[CDATA\\[(?<c>.*?)\\]\\]>|(?<t>[^<]+))\\s*</loc>",
        HtmlRx | RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>Any opening or closing <c>div</c> tag; group 1 is "/" for a closing tag.</summary>
    private static readonly Regex DivTagRx = new(
        "<(/?)div(?=[\\s>/])[^>]*>",
        HtmlRx | RegexOptions.Compiled);

    private readonly IHttpClientFactory _http;
    private readonly ILogger<MedjobsListingSource> _log;

    public MedjobsListingSource(IHttpClientFactory http, ILogger<MedjobsListingSource> log)
    {
        _http = http;
        _log = log;
    }

    public string Name => "مدجابز (medjobs.ir)";

    /// <summary>
    /// Fetches up to <c>MedjobsMaxAds</c> ads (clamped to 1..500) from medjobs.ir.
    /// </summary>
    /// <param name="s">Settings; when <c>MedjobsEnabled</c> is false nothing is fetched.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>
    /// The scraped ads. An empty list is returned when the source is disabled, when no ad sitemaps
    /// are found, when the run is cancelled, or when the run fails as a whole. A failure of a single
    /// sitemap or ad page is logged and skipped.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(s);
        if (!s.MedjobsEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.MedjobsMaxAds, 1, 500);
        var client = _http.CreateClient("scrape");

        try
        {
            // 1. sitemap index → the ad_listing sitemaps
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var adSitemaps = Locs(index)
                .Where(u => u.Contains("ad_listing-sitemap", StringComparison.Ordinal) && IsMedjobsUrl(u))
                .ToList();
            if (adSitemaps.Count == 0)
            {
                _log.LogWarning("medjobs: no ad_listing sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs, 3. fetch each ad page → title + description
            var adUrls = await CollectAdUrlsAsync(client, adSitemaps, max, ct);
            var items = await FetchAdsAsync(client, adUrls, ct);

            _log.LogInformation("medjobs: fetched {Count} ads", items.Count);
            return items;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Cancellation has always resulted in an empty list rather than an exception; keep that
            // contract, but don't report it as a failure.
            _log.LogInformation("medjobs: fetch cancelled");
            return Array.Empty<ScrapedItem>();
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "medjobs fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>Reads the given sitemaps in order and returns up to <paramref name="max"/> distinct ad URLs.</summary>
    private async Task<List<string>> CollectAdUrlsAsync(
        HttpClient client, IReadOnlyList<string> adSitemaps, int max, CancellationToken ct)
    {
        var adUrls = new List<string>();
        foreach (var sm in adSitemaps)
        {
            if (adUrls.Count >= max) break;
            try
            {
                var body = await client.GetStringAsync(sm, ct);
                adUrls.AddRange(Locs(body).Where(IsAdUrl));
            }
            // The filter lets a caller-requested cancellation escape instead of being logged once per
            // sitemap; request timeouts (token not cancelled) are still treated as a per-sitemap failure.
            catch (Exception ex) when (!ct.IsCancellationRequested)
            {
                _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm);
            }
        }
        return adUrls.Distinct(StringComparer.Ordinal).Take(max).ToList();
    }

    /// <summary>Downloads every ad page and keeps those that yield at least <see cref="MinTextLength"/> characters.</summary>
    private async Task<List<ScrapedItem>> FetchAdsAsync(
        HttpClient client, IReadOnlyList<string> adUrls, CancellationToken ct)
    {
        var items = new List<ScrapedItem>(adUrls.Count);
        foreach (var url in adUrls)
        {
            ct.ThrowIfCancellationRequested();
            try
            {
                var html = await client.GetStringAsync(url, ct);
                var text = ExtractAd(html);
                if (text.Length >= MinTextLength) items.Add(new ScrapedItem(SourceLabel, text, url));
            }
            catch (Exception ex) when (!ct.IsCancellationRequested)
            {
                _log.LogWarning(ex, "medjobs: ad {Url} failed", url);
            }
        }
        return items;
    }

    /// <summary>True for an individual ad page: under <c>/ads/</c>, but not the bare <c>/ads/</c> archive itself.</summary>
    private static bool IsAdUrl(string u)
        => u.Contains("/ads/", StringComparison.Ordinal)
           && !u.TrimEnd('/').EndsWith("/ads", StringComparison.Ordinal)
           && IsMedjobsUrl(u);

    /// <summary>
    /// True when <paramref name="url"/> is an absolute http(s) URL on medjobs.ir or one of its subdomains.
    /// Sitemap content is remote input, so it must not be able to point the scraper at other hosts.
    /// </summary>
    private static bool IsMedjobsUrl(string url)
        => Uri.TryCreate(url, UriKind.Absolute, out var uri)
           && (uri.Scheme == Uri.UriSchemeHttps || uri.Scheme == Uri.UriSchemeHttp)
           && (uri.Host.Equals(Host, StringComparison.OrdinalIgnoreCase)
               || uri.Host.EndsWith("." + Host, StringComparison.OrdinalIgnoreCase));

    /// <summary>
    /// Yields the trimmed, non-empty values of all <c>&lt;loc&gt;</c> elements in a sitemap document.
    /// Plain-text values are entity-decoded (sitemaps escape <c>&amp;</c> as <c>&amp;amp;</c>);
    /// CDATA-wrapped values are taken verbatim.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRx.Matches(xml)
            .Select(m => m.Groups["c"].Success
                ? m.Groups["c"].Value.Trim()
                : System.Net.WebUtility.HtmlDecode(m.Groups["t"].Value).Trim())
            .Where(u => u.Length > 0);

    /// <summary>
    /// Builds the ad text: title (og:title, with everything from the first '|' dropped when that '|'
    /// is past index 10) followed by the body, converted to plain text and cut to
    /// <see cref="MaxTextLength"/>. The body is the first non-blank of: the <c>rtcl-description</c>
    /// div, the <c>entry-content</c> div, og:description.
    /// </summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        // Fall through on blank as well as missing: an empty description div must not block the fallbacks.
        var body = BetweenClass(html, "rtcl-description");
        if (string.IsNullOrWhiteSpace(body)) body = BetweenClass(html, "entry-content");
        if (string.IsNullOrWhiteSpace(body)) body = Meta(html, "og:description");

        var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
        var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
        return Truncate(text, MaxTextLength);
    }

    /// <summary>Cuts <paramref name="text"/> to at most <paramref name="max"/> chars without splitting a surrogate pair.</summary>
    private static string Truncate(string text, int max)
    {
        if (text.Length <= max) return text;
        var cut = char.IsHighSurrogate(text[max - 1]) ? max - 1 : max;
        return text[..cut];
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta property="prop"&gt;</c> tag,
    /// or null when there is none. Either attribute order is accepted, and the value ends at the quote
    /// character that opened it (so an apostrophe inside a double-quoted value is kept).
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        var propAttr = $"\\sproperty\\s*=\\s*[\"']{Regex.Escape(prop)}[\"']";
        var pattern = $"<meta(?:[^>]*?{propAttr}[^>]*?{ContentAttr}|[^>]*?{ContentAttr}[^>]*?{propAttr})";
        var m = Regex.Match(html, pattern, HtmlRx);
        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups["v"].Value) : null;
    }

    /// <summary>
    /// Grab the inner HTML of the first &lt;div class="...name..."&gt; (best-effort). Nested divs are
    /// balanced so the result runs to the matching closing tag; if the markup never balances, the text
    /// up to the first closing div is returned instead. Returns null when no such div (or no closing
    /// tag) exists. The class name is matched as a substring of the class attribute.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(
            html,
            $"<div[^>]*?\\sclass\\s*=\\s*[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>",
            HtmlRx);
        if (!open.Success) return null;

        var start = open.Index + open.Length;
        var depth = 1;
        var firstClose = -1;
        for (var tag = DivTagRx.Match(html, start); tag.Success; tag = tag.NextMatch())
        {
            if (tag.Groups[1].Length == 0)
            {
                depth++;
                continue;
            }

            if (firstClose < 0) firstClose = tag.Index;
            if (--depth == 0) return html[start..tag.Index];
        }

        // Unbalanced markup: degrade to "up to the first closing div".
        return firstClose >= 0 ? html[start..firstClose] : null;
    }
}
Add medjobs.ir scraper + employer/employee choice at signup 2026-06-04 06:12:10 +03:30
			`using System.Text.RegularExpressions;`
			`using JobsMedical.Web.Models;`

			`namespace JobsMedical.Web.Services.Scraping;`

			`/// <summary>`
			`/// Scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site). It reads the`
			`/// site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate every ad URL,`
			`/// then fetches each ad page and extracts its title + description. The engine's content-hash`
			`/// dedupe means each ad is only ever ingested once, so repeated runs pick up only new ads.`
			`/// Published items become job pages on hamkadr.ir (the SEO goal).`
			`/// </summary>`
			`public class MedjobsListingSource : IListingSource`
			`{`
			`private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";`
			`private readonly IHttpClientFactory _http;`
			`private readonly ILogger<MedjobsListingSource> _log;`

			`public MedjobsListingSource(IHttpClientFactory http, ILogger<MedjobsListingSource> log)`
			`{`
			`_http = http;`
			`_log = log;`
			`}`

			`public string Name => "مدجابز (medjobs.ir)";`

			`public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)`
			`{`
			`if (!s.MedjobsEnabled) return Array.Empty<ScrapedItem>();`
			`var max = Math.Clamp(s.MedjobsMaxAds, 1, 500);`
			`var client = _http.CreateClient("scrape");`

			`try`
			`{`
			`// 1. sitemap index → the ad_listing sitemaps`
			`var index = await client.GetStringAsync(SitemapIndex, ct);`
			`var adSitemaps = Locs(index).Where(u => u.Contains("ad_listing-sitemap")).ToList();`
			`if (adSitemaps.Count == 0) { _log.LogWarning("medjobs: no ad_listing sitemaps found"); return Array.Empty<ScrapedItem>(); }`

			`// 2. collect ad URLs (skip the bare /ads/ archive)`
			`var adUrls = new List<string>();`
			`foreach (var sm in adSitemaps)`
			`{`
			`if (adUrls.Count >= max) break;`
			`try`
			`{`
			`var body = await client.GetStringAsync(sm, ct);`
			`adUrls.AddRange(Locs(body).Where(u => u.Contains("/ads/") && !u.TrimEnd('/').EndsWith("/ads")));`
			`}`
			`catch (Exception ex) { _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm); }`
			`}`
			`adUrls = adUrls.Distinct().Take(max).ToList();`

			`// 3. fetch each ad page → title + description`
			`var items = new List<ScrapedItem>();`
			`foreach (var url in adUrls)`
			`{`
			`ct.ThrowIfCancellationRequested();`
			`try`
			`{`
			`var html = await client.GetStringAsync(url, ct);`
			`var text = ExtractAd(html);`
			`if (text.Length >= 25) items.Add(new ScrapedItem("مدجابز", text, url));`
			`}`
			`catch (Exception ex) { _log.LogWarning(ex, "medjobs: ad {Url} failed", url); }`
			`}`
			`_log.LogInformation("medjobs: fetched {Count} ads", items.Count);`
			`return items;`
			`}`
			`catch (Exception ex)`
			`{`
			`_log.LogWarning(ex, "medjobs fetch failed");`
			`return Array.Empty<ScrapedItem>();`
			`}`
			`}`

			`private static IEnumerable<string> Locs(string xml)`
			`=> Regex.Matches(xml, "<loc>([^<]+)</loc>").Select(m => m.Groups[1].Value.Trim());`

			`/// <summary>Title (og:title, site suffix stripped) + body (entry/description content or og:description).</summary>`
			`private static string ExtractAd(string html)`
			`{`
			`var title = Meta(html, "og:title");`
			`if (title is not null)`
			`{`
			`var bar = title.IndexOf('|');`
			`if (bar > 10) title = title[..bar].Trim();`
			`}`

			`string? body = BetweenClass(html, "rtcl-description")`
			`?? BetweenClass(html, "entry-content")`
			`?? Meta(html, "og:description");`

			`var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));`
			`var text = HtmlUtil.ToPlainText(string.Join("\n", parts));`
			`return text.Length > 1800 ? text[..1800] : text;`
			`}`

			`private static string? Meta(string html, string prop)`
			`{`
			`var m = Regex.Match(html, $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=[\"']([^\"']*)[\"']");`
			`return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups[1].Value) : null;`
			`}`

			`/// <summary>Grab the inner HTML of the first &lt;div class="...name..."&gt; (best-effort).</summary>`
			`private static string? BetweenClass(string html, string cls)`
			`{`
			`var m = Regex.Match(html, $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</div>",`
			`RegexOptions.Singleline);`
			`return m.Success ? m.Groups[1].Value : null;`
			`}`
			`}`