Add iranestekhdam.ir as an ingestion source (clinical job ads at named facilities)
New IranEstekhdamListingSource: reads the site's monthly ad sitemaps (sitemap-ads.xml -> sitemap-ads-YYYY-M.xml), keeps only ad URLs whose Persian slug names a clinical role (veterinary/non-clinical excluded), then fetches each ad page and extracts its title + description (+ phone). These are employer ads at NAMED facilities, so they directly improve the unknown-facility problem that the classifieds content has. Wired in like Medjobs: AppSetting toggles (IranEstekhdamEnabled/MaxAds/UseProxy) + EF migration, SettingsService persistence, admin Settings UI, and DI registration. Off by default; the medical-gate validator + AI auditor + junk filters screen results downstream. Note: e-estekhdam / jobinja / jobvision are JS-rendered SPAs whose ad lists are not in the static HTML, so they need API reverse-engineering (a separate effort), not this static-scrape path. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,135 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Scrapes clinical job ads from iranestekhdam.ir. It reads the site's monthly ad sitemaps
/// (sitemap-ads.xml → sitemap-ads-YYYY-M.xml) to enumerate ad URLs, keeps only those whose
/// readable Persian slug names a CLINICAL role (veterinary / non-clinical excluded), then fetches
/// each ad page and extracts its title + description (+ any phone). These are EMPLOYER ads at NAMED
/// facilities (hospital / clinic / laboratory …) — far higher quality than classifieds, so they
/// directly improve the "unknown facility" problem. Content-hash dedupe ingests each ad once; the
/// medical-gate validator + AI auditor + junk filters do the final screening on top.
/// </summary>
public class IranEstekhdamListingSource : IListingSource
{
    private const string SitemapIndex = "https://iranestekhdam.ir/sitemap-ads.xml";

    /// <summary>Upper bound on the extracted ad text, in UTF-16 code units (before any phone suffix).</summary>
    private const int MaxTextLength = 1800;

    /// <summary>Ads whose extracted text is shorter than this are not returned.</summary>
    private const int MinTextLength = 25;

    /// <summary>
    /// Matches one sitemap &lt;loc&gt; entry. The URL is captured either raw (inside a CDATA section)
    /// or XML-escaped (plain element text), so the caller knows whether it still needs decoding.
    /// </summary>
    private static readonly Regex LocRx = new(
        @"<loc>\s*(?:<!\[CDATA\[(?<raw>.*?)\]\]>|(?<esc>[^<]+))\s*</loc>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>Pulls the YYYY-M part out of a monthly sitemap URL (ASCII digits only).</summary>
    private static readonly Regex SitemapMonthRx = new(
        @"sitemap-ads-([0-9]{4})-([0-9]{1,2})",
        RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<IranEstekhdamListingSource> _log;

    public IranEstekhdamListingSource(ScrapeHttpClients clients, ILogger<IranEstekhdamListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    public string Name => "ایراناستخدام (iranestekhdam.ir)";

    // Clinical-role markers matched against the DECODED Persian URL slug. Words are hyphen-joined in
    // the slug, so substring matching works on the decoded form.
    private static readonly string[] RoleSlugs =
    {
        "پرستار", "بهیار", "کمک-پرستار", "کمک-بهیار", "پزشک", "دندان", "مامایی", "ماما", "تکنسین",
        "رادیولوژ", "سونوگراف", "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر",
        "دیالیز", "اتاق-عمل", "بیهوش", "تزریقات", "فوریت", "اورژانس", "داروساز", "نسخه", "سالمند",
    };

    // Slugs that share a substring with a clinical role but are NOT clinical staff (veterinary) — drop them.
    private static readonly string[] ExcludeSlugs = { "دامپزشک", "دام-پزشک", "دامپزشکی" };

    /// <summary>
    /// Enumerates clinical ad URLs from the monthly sitemaps and fetches up to
    /// <c>IranEstekhdamMaxAds</c> (clamped to 1..500) ad pages.
    /// </summary>
    /// <param name="s">Settings; the source is a no-op unless <c>IranEstekhdamEnabled</c> is set.</param>
    /// <param name="ct">Cancellation token flowed into every HTTP request.</param>
    /// <returns>
    /// The scraped ads. Never throws: on a top-level failure or on cancellation the failure is logged
    /// and an empty list is returned; a failure of a single sitemap or ad page only skips that item.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.IranEstekhdamEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.IranEstekhdamMaxAds, 1, 500);
        var client = _clients.For(s, s.IranEstekhdamUseProxy);

        try
        {
            // 1. sitemap index → the monthly ad sitemaps. The sort is stable, so when the site already
            //    lists them newest-first the order is unchanged; otherwise newest-first is enforced.
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var monthly = Locs(index)
                .Where(u => u.Contains("sitemap-ads-", StringComparison.Ordinal))
                .OrderByDescending(SitemapMonthKey)
                .ToList();
            if (monthly.Count == 0)
            {
                _log.LogWarning("iranestekhdam: no monthly ad sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs, keeping only clinical-role slugs. Pull from successive monthly
            //    sitemaps until we have enough candidates (or run out). `seen` gives O(1) duplicate
            //    checks while `picked` preserves discovery order.
            var picked = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (var sm in monthly)
            {
                if (picked.Count >= max) break;
                try
                {
                    var xml = await client.GetStringAsync(sm, ct);
                    foreach (var u in Locs(xml).Where(IsClinicalSlug))
                    {
                        if (seen.Add(u)) picked.Add(u);
                        if (picked.Count >= max) break;
                    }
                }
                // Per-sitemap failures are tolerated, but a cancelled token must stop the whole run
                // instead of failing (and logging) once per remaining sitemap.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "iranestekhdam: sitemap {Sm} failed", sm);
                }
            }

            // 3. fetch each ad page → title + description (+ phone if present in the body)
            var items = new List<ScrapedItem>();
            foreach (var url in picked)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length >= MinTextLength) items.Add(new ScrapedItem("ایراناستخدام", text, url));
                }
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "iranestekhdam: ad {Url} failed", url);
                }
            }

            _log.LogInformation("iranestekhdam: fetched {Count} clinical ads", items.Count);
            return items;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Cancellation is not a scrape failure; keep the "never throws, returns empty" contract.
            _log.LogInformation("iranestekhdam: fetch cancelled");
            return Array.Empty<ScrapedItem>();
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "iranestekhdam fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// Sort key for a monthly sitemap URL: year * 100 + month, or 0 when the URL carries no
    /// recognisable YYYY-M suffix (such URLs sort last, keeping their relative order).
    /// </summary>
    private static int SitemapMonthKey(string url)
    {
        var m = SitemapMonthRx.Match(url);
        if (!m.Success) return 0;

        var inv = System.Globalization.CultureInfo.InvariantCulture;
        return int.Parse(m.Groups[1].Value, inv) * 100 + int.Parse(m.Groups[2].Value, inv);
    }

    /// <summary>
    /// True when the percent-decoded URL contains a clinical-role marker and no exclusion marker.
    /// </summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(slug.Contains)) return false;
        return RoleSlugs.Any(slug.Contains);
    }

    /// <summary>
    /// Extracts every non-empty &lt;loc&gt; URL from a sitemap document. Plain element text is
    /// entity-decoded (sitemap URLs are XML-escaped, e.g. <c>&amp;amp;</c>); CDATA content is taken as is.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRx.Matches(xml)
            .Select(m => m.Groups["raw"].Success
                ? m.Groups["raw"].Value
                : System.Net.WebUtility.HtmlDecode(m.Groups["esc"].Value))
            .Select(u => u.Trim())
            .Where(u => u.Length > 0);

    /// <summary>Title (site suffix stripped) + the ad's description. iranestekhdam puts a complete,
    /// structured summary (facility + city + district + role) in og:description, with the full
    /// requirements in the .single-ad container — prefer whichever yields more text.</summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Strip the "| site name" suffix, but only when the bar is clearly not part of a short title.
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        var ogBody = Meta(html, "og:description");
        var single = BetweenClass(html, "single-ad");
        var singleText = single is null ? null : HtmlUtil.ToPlainText(single);
        var body = (singleText?.Length ?? 0) > (ogBody?.Length ?? 0) ? singleText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > MaxTextLength)
        {
            // Never cut between the two halves of a surrogate pair (e.g. an emoji in the ad).
            var cut = char.IsHighSurrogate(text[MaxTextLength - 1]) ? MaxTextLength - 1 : MaxTextLength;
            text = text[..cut];
        }

        // Append every harvested phone the (possibly truncated) text does not already show, so a
        // number lost to truncation is kept even when another number survived.
        var phones = HtmlUtil.HarvestPhones(body ?? "");
        var missing = phones.Where(p => !text.Contains(p, StringComparison.Ordinal)).ToList();
        if (missing.Count > 0)
            text += "\nشماره تماس: " + string.Join("، ", missing);
        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first &lt;meta&gt; tag whose <c>property</c>
    /// equals <paramref name="prop"/>, or null when there is none. Either attribute order is accepted,
    /// and the value ends at the quote character that opened it, so an apostrophe inside a
    /// double-quoted value no longer truncates it.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        const RegexOptions Opts = RegexOptions.IgnoreCase | RegexOptions.Singleline;
        var p = Regex.Escape(prop);

        // property … content (the common order)
        var m = Regex.Match(
            html,
            $"<meta\\b[^>]*?\\sproperty\\s*=\\s*[\"']{p}[\"'][^>]*?\\scontent\\s*=\\s*(?<q>[\"'])(?<v>.*?)\\k<q>",
            Opts);

        // content … property; the value is kept inside the tag so the match cannot drift across elements
        if (!m.Success)
        {
            m = Regex.Match(
                html,
                $"<meta\\b[^>]*?\\scontent\\s*=\\s*(?<q>[\"'])(?<v>[^<>]*?)\\k<q>[^>]*?\\sproperty\\s*=\\s*[\"']{p}[\"']",
                Opts);
        }

        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups["v"].Value) : null;
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, or null when no such element (or no closing tag for it) is found.
    /// Nested elements of the same tag are depth-counted, so the result runs to the container's
    /// own closing tag rather than stopping at the first nested one.
    /// </summary>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(
            html,
            $"<(div|article|section)\\b[^>]*\\sclass\\s*=\\s*[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>",
            RegexOptions.IgnoreCase);
        if (!open.Success) return null;

        var tag = open.Groups[1].Value;
        var start = open.Index + open.Length;
        var sameTag = new Regex($"<(/?){Regex.Escape(tag)}\\b[^>]*>", RegexOptions.IgnoreCase);

        var depth = 1;
        var lastClose = -1;
        for (var m = sameTag.Match(html, start); m.Success; m = m.NextMatch())
        {
            if (m.Groups[1].Length > 0)
            {
                lastClose = m.Index;
                if (--depth == 0) return html[start..m.Index];
            }
            else if (!m.Value.EndsWith("/>", StringComparison.Ordinal))
            {
                depth++;
            }
        }

        // Unbalanced markup: keep everything up to the last closing tag we saw, if any.
        return lastClose >= 0 ? html[start..lastClose] : null;
    }
}
|
||||
@@ -55,6 +55,9 @@ public class SettingsService
|
||||
s.DivarQueries = incoming.DivarQueries?.Trim();
|
||||
s.MedjobsEnabled = incoming.MedjobsEnabled;
|
||||
s.MedjobsMaxAds = Math.Clamp(incoming.MedjobsMaxAds, 1, 500);
|
||||
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
|
||||
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
|
||||
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
|
||||
s.SmsEnabled = incoming.SmsEnabled;
|
||||
s.SmsApiKey = incoming.SmsApiKey?.Trim();
|
||||
s.SmsTemplate = incoming.SmsTemplate?.Trim();
|
||||
|
||||
Reference in New Issue
Block a user