Add iranestekhdam.ir as an ingestion source (clinical job ads at named facilities)
CI/CD / CI · dotnet build (push) Successful in 1m43s
CI/CD / Deploy · hamkadr (push) Successful in 1m55s

New IranEstekhdamListingSource: reads the site's monthly ad sitemaps
(sitemap-ads.xml -> sitemap-ads-YYYY-M.xml), keeps only ad URLs whose Persian slug names a
clinical role (veterinary/non-clinical excluded), then extracts each ad title + description
(+ phone). These are employer ads at NAMED facilities, so they directly improve the
unknown-facility problem the classifieds content has.

Wired in like Medjobs: AppSetting toggles (IranEstekhdamEnabled/MaxAds/UseProxy) + EF
migration, SettingsService persistence, admin Settings UI, and DI registration. Off by
default; the medical-gate validator + AI auditor + junk filters screen results downstream.

Note: e-estekhdam / jobinja / jobvision are JS-rendered SPAs whose ad lists are not in static
HTML, so they need API reverse-engineering (a separate effort), not this static-scrape path.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-21 07:39:39 +03:30
parent da55f82c6c
commit f118db55ef
9 changed files with 1869 additions and 0 deletions
@@ -0,0 +1,135 @@
using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Scrapes clinical job ads from iranestekhdam.ir. It reads the site's monthly ad sitemaps
/// (sitemap-ads.xml → sitemap-ads-YYYY-M.xml) to enumerate ad URLs, keeps only those whose
/// readable Persian slug names a CLINICAL role (veterinary / non-clinical excluded), then fetches
/// each ad page and extracts its title + description (+ any phone). These are EMPLOYER ads at NAMED
/// facilities (hospital / clinic / laboratory …) — far higher quality than classifieds, so they
/// directly improve the "unknown facility" problem. Content-hash dedupe ingests each ad once; the
/// medical-gate validator + AI auditor + junk filters do the final screening on top.
/// </summary>
public class IranEstekhdamListingSource : IListingSource
{
    private const string SitemapIndex = "https://iranestekhdam.ir/sitemap-ads.xml";

    /// <summary>Upper bound applied to the configured ad count, whatever the setting says.</summary>
    private const int MaxAdsCeiling = 500;

    /// <summary>Extracted ads shorter than this many characters are dropped.</summary>
    private const int MinAdTextLength = 25;

    /// <summary>Extracted ad text is cut to this many characters before any phone line is appended.</summary>
    private const int MaxAdTextLength = 1800;

    // Hoisted so the patterns are compiled once instead of on every page.
    private static readonly Regex LocRx = new(
        "<loc>([^<]+)</loc>",
        RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // A whole <meta …> tag. Quoted attribute values are consumed as units, so a '>' or the
    // "other" quote character inside a value does not end the tag early.
    private static readonly Regex MetaTagRx = new(
        "<meta\\b(?:[^>\"']|\"[^\"]*\"|'[^']*')*>",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // name="value" / name='value' pairs inside a single tag; the closing quote must be the same
    // character as the opening one.
    private static readonly Regex AttrRx = new(
        "([\\w:-]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')",
        RegexOptions.CultureInvariant | RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<IranEstekhdamListingSource> _log;

    public IranEstekhdamListingSource(ScrapeHttpClients clients, ILogger<IranEstekhdamListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "ایران‌استخدام (iranestekhdam.ir)";

    // Clinical-role markers matched against the DECODED Persian URL slug. Words are hyphen-joined in
    // the slug, so substring matching works on the decoded form.
    private static readonly string[] RoleSlugs =
    {
        "پرستار", "بهیار", "کمک-پرستار", "کمک-بهیار", "پزشک", "دندان", "مامایی", "ماما", "تکنسین",
        "رادیولوژ", "سونوگراف", "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر",
        "دیالیز", "اتاق-عمل", "بیهوش", "تزریقات", "فوریت", "اورژانس", "داروساز", "نسخه", "سالمند",
    };

    // Slugs that share a substring with a clinical role but are NOT clinical staff (veterinary) — drop them.
    private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };

    /// <summary>
    /// Fetches up to <c>IranEstekhdamMaxAds</c> (clamped to 1..500) clinical ads.
    /// </summary>
    /// <param name="s">Settings; the source is skipped entirely when <c>IranEstekhdamEnabled</c> is false.</param>
    /// <param name="ct">Cancellation token passed to every HTTP request.</param>
    /// <returns>
    /// The extracted ads. An empty list is returned when the source is disabled, the sitemap index
    /// lists no monthly sitemaps, the run fails as a whole, or the run is cancelled. A failure of
    /// a single sitemap or a single ad page is logged and skipped.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.IranEstekhdamMaxAds, 1, MaxAdsCeiling);
        var client = _clients.For(s, s.IranEstekhdamUseProxy);
        try
        {
            // 1. sitemap index → the monthly ad sitemaps (newest first as listed by the site)
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var monthly = Locs(index).Where(u => u.Contains("sitemap-ads-", StringComparison.Ordinal)).ToList();
            if (monthly.Count == 0)
            {
                _log.LogWarning("iranestekhdam: no monthly ad sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly
            //    sitemaps until we have enough candidates (or run out). The list keeps sitemap
            //    order; the set makes the duplicate check O(1).
            var picked = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (var sm in monthly)
            {
                if (picked.Count >= max) break;
                try
                {
                    var sitemapXml = await client.GetStringAsync(sm, ct);
                    foreach (var u in Locs(sitemapXml).Where(IsClinicalSlug))
                    {
                        if (seen.Add(u)) picked.Add(u);
                        if (picked.Count >= max) break;
                    }
                }
                // The filter lets a real cancellation escape instead of being logged once per
                // remaining sitemap. An HTTP timeout (token not cancelled) is still a soft failure.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm);
                }
            }

            // 3. fetch each ad page → title + description (+ phone if present in the body)
            var items = new List<ScrapedItem>();
            foreach (var url in picked)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length >= MinAdTextLength) items.Add(new ScrapedItem("ایران‌استخدام", text, url));
                }
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url);
                }
            }

            _log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count);
            return items;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // NOTE(review): cancellation is reported as an empty result, not rethrown, to keep the
            // existing "never throws" behaviour of this method. Confirm whether callers would
            // rather see the OperationCanceledException.
            _log.LogInformation("iranestekhdam: fetch cancelled");
            return Array.Empty<ScrapedItem>();
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "iranestekhdam fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// True when the percent-decoded URL contains a clinical-role marker and none of the
    /// exclusion markers. Exclusions are checked first because a veterinary slug also contains a
    /// clinical marker as a substring.
    /// </summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains)) return false;
        return RoleSlugs.Any(slug.Contains);
    }

    /// <summary>
    /// Returns the trimmed content of every <c>&lt;loc&gt;</c> element in a sitemap document.
    /// Values are entity-decoded because sitemap URLs are entity-escaped (e.g. <c>&amp;amp;</c>).
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRx.Matches(xml).Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>Title (site suffix stripped) + the ad's description. iranestekhdam puts a complete,
    /// structured summary (facility + city + district + role) in og:description, with the full
    /// requirements in the .single-ad container — prefer whichever yields more text.</summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Strip everything from the first '|' on, but only when the part before it is longer
            // than 10 characters, so a title is never reduced to a very short stub.
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        var ogBody = Meta(html, "og:description");
        var single = BetweenClass(html, "single-ad");
        var singleText = single is null ? null : HtmlUtil.ToPlainText(single);
        var body = (singleText?.Length ?? 0) > (ogBody?.Length ?? 0) ? singleText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        text = Truncate(text, MaxAdTextLength);

        // Phones are appended after the cut so that truncation cannot remove the contact number.
        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);
        return text;
    }

    /// <summary>
    /// Cuts <paramref name="text"/> to at most <paramref name="max"/> UTF-16 code units, backing
    /// off by one when the cut would otherwise separate a surrogate pair.
    /// </summary>
    private static string Truncate(string text, int max)
    {
        if (text.Length <= max) return text;
        var cut = char.IsHighSurrogate(text[max - 1]) ? max - 1 : max;
        return text[..cut];
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta&gt;</c> tag whose
    /// <c>property</c> equals <paramref name="prop"/>, or null when there is none. The two
    /// attributes may appear in either order, and a value may contain the quote character that
    /// is not its own delimiter.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        foreach (Match tag in MetaTagRx.Matches(html))
        {
            string? property = null;
            string? content = null;
            foreach (Match attr in AttrRx.Matches(tag.Value))
            {
                var name = attr.Groups[1].Value;
                var value = attr.Groups[2].Success ? attr.Groups[2].Value : attr.Groups[3].Value;
                if (name.Equals("property", StringComparison.OrdinalIgnoreCase)) property ??= value;
                else if (name.Equals("content", StringComparison.OrdinalIgnoreCase)) content ??= value;
            }

            if (content is not null && string.Equals(property, prop, StringComparison.Ordinal))
                return System.Net.WebUtility.HtmlDecode(content);
        }

        return null;
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, or null when no such element (or no closing tag) is found. Nested
    /// elements of the same tag name are balanced, so the result runs to the container's own
    /// closing tag rather than to the first closing tag inside it.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(
            html,
            $"<(div|article|section)\\b[^>]*\\sclass\\s*=\\s*[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
        if (!open.Success) return null;

        var start = open.Index + open.Length;
        var sameTag = new Regex(
            $"<(/?){Regex.Escape(open.Groups[1].Value)}\\b[^>]*>",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

        var depth = 1;
        var firstClose = -1;
        for (var m = sameTag.Match(html, start); m.Success; m = m.NextMatch())
        {
            if (m.Groups[1].Length == 0)
            {
                depth++;
                continue;
            }

            if (firstClose < 0) firstClose = m.Index;
            if (--depth == 0) return html[start..m.Index];
        }

        // Unbalanced markup: fall back to the first closing tag, if there was one.
        return firstClose < 0 ? null : html[start..firstClose];
    }
}
@@ -55,6 +55,9 @@ public class SettingsService
s.DivarQueries = incoming.DivarQueries?.Trim();
s.MedjobsEnabled = incoming.MedjobsEnabled;
s.MedjobsMaxAds = Math.Clamp(incoming.MedjobsMaxAds, 1, 500);
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
s.SmsEnabled = incoming.SmsEnabled;
s.SmsApiKey = incoming.SmsApiKey?.Trim();
s.SmsTemplate = incoming.SmsTemplate?.Trim();