Add medjobs.ir scraper + employer/employee choice at signup

- MedjobsListingSource: crawls medjobs.ir sitemaps (ad_listing-sitemapN) → fetches ad pages → extracts title + description → hands them to the engine (dedupe/parse/validate/publish as SEO job pages). Configured in /Admin/Settings (enable + max ads/run).
- Login/register now asks 'کادر درمان' vs 'کارفرما/مرکز': new accounts get the Doctor vs FacilityAdmin role; post-login routes to /Me, /Employer, or /Admin accordingly.
- Verified live: a medjobs run fetched real ads into the review queue; employer signup → /Employer.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 06:12:10 +03:30
parent d828ea9f35
commit e2e26150cb
12 changed files with 1106 additions and 3 deletions
@@ -0,0 +1,112 @@
+using System.Text.RegularExpressions;
+using JobsMedical.Web.Models;
+
+namespace JobsMedical.Web.Services.Scraping;
+
+/// <summary>
+/// Listing source that scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site).
+/// It walks the site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate ad
+/// URLs, downloads each ad page and reduces it to plain text made of the title and description.
+/// </summary>
+/// <remarks>
+/// Per the original design notes, the engine de-duplicates by content hash, so an ad is ingested
+/// only once across runs, and published items become job pages on hamkadr.ir (the SEO goal).
+/// Neither of those behaviours is implemented in this class.
+/// </remarks>
+public class MedjobsListingSource : IListingSource
+{
+    private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";
+
+    /// <summary>Ads whose extracted plain text is shorter than this are dropped.</summary>
+    private const int MinTextLength = 25;
+
+    /// <summary>Upper bound (in UTF-16 code units) on the plain text kept per ad.</summary>
+    private const int MaxTextLength = 1800;
+
+    private static readonly Regex LocRx = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);
+    private static readonly Regex MetaTagRx = new("<meta\\b[^>]*>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
+    private static readonly Regex DivTagRx = new("<(/?)div\\b[^>]*>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
+
+    private readonly IHttpClientFactory _http;
+    private readonly ILogger<MedjobsListingSource> _log;
+
+    public MedjobsListingSource(IHttpClientFactory http, ILogger<MedjobsListingSource> log)
+    {
+        _http = http;
+        _log = log;
+    }
+
+    public string Name => "مدجابز (medjobs.ir)";
+
+    /// <summary>
+    /// Fetches up to <c>s.MedjobsMaxAds</c> ads (clamped to 1..500) when <c>s.MedjobsEnabled</c> is set.
+    /// </summary>
+    /// <param name="s">Settings carrying the enable flag and the per-run ad limit.</param>
+    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
+    /// <returns>
+    /// One <see cref="ScrapedItem"/> per ad whose extracted text is at least 25 characters long.
+    /// An empty list when the source is disabled, no ad sitemap is found, or the run fails or is
+    /// cancelled (failures inside the scraping pipeline are logged, not rethrown).
+    /// </returns>
+    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
+    {
+        if (!s.MedjobsEnabled) return Array.Empty<ScrapedItem>();
+        var max = Math.Clamp(s.MedjobsMaxAds, 1, 500);
+        var client = _http.CreateClient("scrape");
+
+        try
+        {
+            // 1. sitemap index → the ad_listing sitemaps
+            var index = await client.GetStringAsync(SitemapIndex, ct);
+            var adSitemaps = Locs(index)
+                .Where(u => u.Contains("ad_listing-sitemap", StringComparison.Ordinal))
+                .ToList();
+            if (adSitemaps.Count == 0)
+            {
+                _log.LogWarning("medjobs: no ad_listing sitemaps found");
+                return Array.Empty<ScrapedItem>();
+            }
+
+            // 2. collect ad URLs (skip the bare /ads/ archive), de-duplicating as we go so the
+            //    early exit below counts unique URLs rather than raw <loc> entries.
+            // NOTE(review): only the first `max` URLs in sitemap order are ever fetched. If the
+            // sitemaps list the oldest ads first, ads beyond that window are never reached —
+            // confirm the ordering medjobs.ir actually uses.
+            var adUrls = new List<string>();
+            var seen = new HashSet<string>(StringComparer.Ordinal);
+            foreach (var sm in adSitemaps)
+            {
+                if (adUrls.Count >= max) break;
+                try
+                {
+                    var body = await client.GetStringAsync(sm, ct);
+                    foreach (var u in Locs(body))
+                    {
+                        if (IsAdUrl(u) && seen.Add(u)) adUrls.Add(u);
+                    }
+                }
+                // A failing sitemap is skipped, but cancellation must end the whole run instead of
+                // being logged once per remaining sitemap.
+                catch (Exception ex) when (!ct.IsCancellationRequested)
+                {
+                    _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm);
+                }
+            }
+            if (adUrls.Count > max) adUrls.RemoveRange(max, adUrls.Count - max);
+
+            // 3. fetch each ad page → title + description
+            var items = new List<ScrapedItem>();
+            foreach (var url in adUrls)
+            {
+                ct.ThrowIfCancellationRequested();
+                try
+                {
+                    var html = await client.GetStringAsync(url, ct);
+                    var text = ExtractAd(html);
+                    if (text.Length >= MinTextLength) items.Add(new ScrapedItem("مدجابز", text, url));
+                }
+                // Same policy as above: one bad page is skipped, cancellation stops the loop.
+                catch (Exception ex) when (!ct.IsCancellationRequested)
+                {
+                    _log.LogWarning(ex, "medjobs: ad {Url} failed", url);
+                }
+            }
+            _log.LogInformation("medjobs: fetched {Count} ads", items.Count);
+            return items;
+        }
+        catch (Exception ex)
+        {
+            // Best-effort source: any failure (including cancellation) yields an empty result.
+            _log.LogWarning(ex, "medjobs fetch failed");
+            return Array.Empty<ScrapedItem>();
+        }
+    }
+
+    /// <summary>True for URLs under /ads/ other than the bare /ads archive page itself.</summary>
+    private static bool IsAdUrl(string url)
+        => url.Contains("/ads/", StringComparison.Ordinal)
+           && !url.TrimEnd('/').EndsWith("/ads", StringComparison.Ordinal);
+
+    /// <summary>
+    /// Yields the trimmed text of every &lt;loc&gt; element, with XML entities (e.g. &amp;amp;) decoded
+    /// so the value is a usable URL.
+    /// </summary>
+    private static IEnumerable<string> Locs(string xml)
+        => LocRx.Matches(xml)
+            .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim())
+            .Where(u => u.Length > 0);
+
+    /// <summary>Title (og:title, site suffix stripped) + body (entry/description content or og:description).</summary>
+    private static string ExtractAd(string html)
+    {
+        var title = Meta(html, "og:title");
+        if (title is not null)
+        {
+            // Drop everything from the first '|' on, but only when the bar sits past index 10
+            // (presumably so a very short leading segment is not mistaken for the whole title).
+            var bar = title.IndexOf('|');
+            if (bar > 10) title = title[..bar].Trim();
+        }
+
+        string? body = BetweenClass(html, "rtcl-description")
+                    ?? BetweenClass(html, "entry-content")
+                    ?? Meta(html, "og:description");
+
+        var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
+        var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
+        if (text.Length <= MaxTextLength) return text;
+
+        // Do not cut between the two halves of a surrogate pair.
+        var cut = char.IsHighSurrogate(text[MaxTextLength - 1]) ? MaxTextLength - 1 : MaxTextLength;
+        return text[..cut];
+    }
+
+    /// <summary>
+    /// Returns the HTML-decoded <c>content</c> of the first &lt;meta&gt; tag whose <c>property</c>
+    /// equals <paramref name="prop"/> and that has a <c>content</c> attribute, or null if none.
+    /// The two attributes may appear in either order and use either quote style.
+    /// </summary>
+    private static string? Meta(string html, string prop)
+    {
+        foreach (Match tag in MetaTagRx.Matches(html))
+        {
+            if (!string.Equals(Attr(tag.Value, "property"), prop, StringComparison.Ordinal)) continue;
+            var content = Attr(tag.Value, "content");
+            if (content is not null) return System.Net.WebUtility.HtmlDecode(content);
+        }
+        return null;
+    }
+
+    /// <summary>
+    /// Reads a quoted attribute value out of a single tag. The closing quote must be the same
+    /// character as the opening one, so an apostrophe inside a double-quoted value is preserved.
+    /// </summary>
+    private static string? Attr(string tag, string name)
+    {
+        // The look-behind keeps "property" from matching inside e.g. "data-property".
+        var m = Regex.Match(
+            tag,
+            $"(?<![\\w-]){Regex.Escape(name)}\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')",
+            RegexOptions.IgnoreCase);
+        if (!m.Success) return null;
+        return m.Groups[1].Success ? m.Groups[1].Value : m.Groups[2].Value;
+    }
+
+    /// <summary>
+    /// Grab the inner HTML of the first &lt;div class="...name..."&gt; (best-effort). Nested divs are
+    /// balanced by counting open/close tags; if the markup never balances, the content up to the
+    /// first closing &lt;/div&gt; is returned instead, and null when there is no closing tag at all.
+    /// </summary>
+    private static string? BetweenClass(string html, string cls)
+    {
+        var open = Regex.Match(html, $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>");
+        if (!open.Success) return null;
+
+        var start = open.Index + open.Length;
+        var depth = 1;
+        var firstClose = -1;
+        foreach (Match tag in DivTagRx.Matches(html, start))
+        {
+            if (tag.Groups[1].Length == 0)
+            {
+                depth++;
+                continue;
+            }
+            if (firstClose < 0) firstClose = tag.Index;
+            if (--depth == 0) return html[start..tag.Index];
+        }
+        return firstClose < 0 ? null : html[start..firstClose];
+    }
+}