Add medboom.ir as an ingestion source (doctor/dentist-heavy, VPN-free)
New MedboomListingSource: a WordPress medical-classifieds board crawled like medjobs (wp-sitemap.xml -> posts-post-N.xml, newest first), filtered to clinical-role slugs and Tehran-only for launch. medboom skews toward doctors/dentists/pharmacists and carries both hiring and availability posts, so it directly broadens the role mix the nurse-heavy Divar content lacks. Iranian-hosted -> no proxy/VPN needed (relevant now that Telegram is off). Wired like the other sources: AppSetting toggles (MedboomEnabled/MaxAds/UseProxy) + EF migration, SettingsService persistence, admin Settings UI, DI registration. Off by default. Validated against live data: Tehran clinical ads at named clinics (pharmacy/dental/etc.). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,155 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Scrapes clinical ads from medboom.ir («مرجع استخدام و نیازمندی علوم پزشکی») — a WordPress
/// ad-listing site like medjobs.ir. It enumerates ad posts via the WP sitemap
/// (wp-sitemap.xml → wp-sitemap-posts-post-N.xml), newest first, keeps clinical-role slugs, and
/// extracts each ad's title + description (+ phone). medboom skews toward DOCTORS/DENTISTS and
/// carries BOTH hiring («نیازمند…») and availability («آماده همکاری / جویای کار») posts, so it
/// directly broadens the role mix the nurse-heavy classifieds sources miss. Tehran-only for launch.
/// VPN-free (Iranian-hosted). Content-hash dedupe ingests each ad once; the validator/AI screen on top.
/// </summary>
/// <remarks>
/// <see cref="FetchAsync"/> is best-effort: every failure (including cancellation) is logged and
/// results in an empty list instead of an exception.
/// </remarks>
public class MedboomListingSource : IListingSource
{
    private const string SitemapIndex = "https://medboom.ir/wp-sitemap.xml";
    private const string Tehran = "تهران";

    // Candidate URLs pooled per requested ad. The pool is larger than `max` because the
    // per-page Tehran filter in FetchAsync discards candidates after they are fetched.
    private const int PoolFactor = 6;

    // Extracted ads shorter than this are treated as empty / failed extractions.
    private const int MinAdLength = 25;

    // Upper bound on the extracted ad text (before the optional phone line is appended).
    private const int MaxAdLength = 1800;

    private static readonly Regex LocRegex = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);
    private static readonly Regex SitemapNumberRegex = new("posts-post-([0-9]+)", RegexOptions.Compiled);
    private static readonly Regex ContainerTagRegex =
        new("<(/?)(?:div|article|section)\\b[^>]*>", RegexOptions.Compiled);

    // Characters that separate words inside a decoded URL/slug (U+200C is the Persian ZWNJ).
    private static readonly char[] SlugSeparators = { '-', '_', '/', '.', '\u200c' };

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<MedboomListingSource> _log;

    public MedboomListingSource(ScrapeHttpClients clients, ILogger<MedboomListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "مدبوم (medboom.ir)";

    // Clinical-role markers matched (as substrings, so stems work) against the decoded Persian ad slug.
    private static readonly string[] RoleSlugs =
    {
        "پزشک", "دندان", "پرستار", "بهیار", "مامایی", "ماما", "تکنسین", "رادیولوژ", "سونوگراف",
        "فیزیوتراپ", "کاردرمان", "گفتاردرمان", "شنوایی", "بینایی", "اپتومتر", "دیالیز", "اتاق-عمل",
        "بیهوش", "هوشبری", "تزریقات", "فوریت", "اورژانس", "داروساز", "داروخانه", "نسخه", "سالمند",
        "علوم-آزمایشگاهی", "آزمایشگاه", "مسئول-فنی", "مامو", "تغذیه", "روانشناس", "اپتیک",
    };

    // Veterinary + obvious non-staffing categories medboom also carries (equipment sale, real estate).
    private static readonly string[] ExcludeSlugs =
    {
        "دامپزشک", "دام-پزشک", "دامپزشکی", "فروش", "اجاره", "املاک", "دستگاه", "تجهیزات", "ملک",
    };

    // Cities other than Tehran. Each entry must be a single slug token (no '-' inside), because
    // IsOtherCitySlug compares whole tokens.
    private static readonly string[] OtherCitySlugs =
    {
        "شیراز", "اصفهان", "مشهد", "تبریز", "کرج", "قم", "یزد", "رشت", "کرمان", "اراک", "اردبیل",
        "همدان", "کرمانشاه", "زنجان", "قزوین", "ساری", "گرگان", "بندرعباس", "بوشهر", "سنندج",
        "بیرجند", "سمنان", "شهرکرد", "ایلام", "یاسوج", "زاهدان", "ارومیه", "البرز", "اهواز", "کاشان",
    };

    // Must be declared after OtherCitySlugs: static initializers run in textual order.
    private static readonly HashSet<string> OtherCityTokens = new(OtherCitySlugs, StringComparer.Ordinal);

    /// <summary>
    /// Fetches up to <c>MedboomMaxAds</c> (clamped to 1..500) Tehran clinical ads, newest first.
    /// </summary>
    /// <param name="s">Settings; reads <c>MedboomEnabled</c>, <c>MedboomMaxAds</c> and <c>MedboomUseProxy</c>.</param>
    /// <param name="ct">Cancels the crawl; a cancelled crawl returns an empty list.</param>
    /// <returns>The scraped ads, or an empty list when the source is disabled or the crawl fails.</returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedboomEnabled) return Array.Empty<ScrapedItem>();

        var max = Math.Clamp(s.MedboomMaxAds, 1, 500);
        var client = _clients.For(s, s.MedboomUseProxy);

        try
        {
            // 1. WP sitemap index → the ad-post sitemaps. Process newest first (highest-numbered).
            //    Sort by the parsed number: a plain string sort puts "post-9" above "post-10".
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var postMaps = Locs(index)
                .Where(u => u.Contains("posts-post-", StringComparison.Ordinal))
                .OrderByDescending(u => SitemapNumber(u))
                .ThenByDescending(u => u, StringComparer.Ordinal)
                .ToList();
            if (postMaps.Count == 0)
            {
                _log.LogWarning("medboom: no ad-post sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. pool clinical candidate URLs (newest first within each map), pre-dropping other cities.
            var pool = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal); // O(1) dedupe; `pool` keeps the order
            var budget = max * PoolFactor;
            foreach (var sm in postMaps)
            {
                if (pool.Count >= budget) break;
                try
                {
                    // newest ads are listed last → walk each sitemap from the end
                    foreach (var u in Locs(await client.GetStringAsync(sm, ct)).Reverse())
                    {
                        if (IsClinicalSlug(u) && !IsOtherCitySlug(u) && seen.Add(u)) pool.Add(u);
                        if (pool.Count >= budget) break;
                    }
                }
                // One broken sitemap must not abort the crawl, but a cancelled token must: without the
                // filter every remaining sitemap would be attempted and logged as a failure.
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: sitemap {Sm} failed", sm);
                }
            }

            // 3. fetch each ad → keep only Tehran ones, up to `max`.
            var items = new List<ScrapedItem>();
            foreach (var url in pool)
            {
                if (items.Count >= max) break;
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length < MinAdLength || !text.Contains(Tehran)) continue; // Tehran-only launch filter
                    items.Add(new ScrapedItem("مدبوم", text, url));
                }
                catch (Exception ex) when (!ct.IsCancellationRequested)
                {
                    _log.LogWarning(ex, "medboom: ad {Url} failed", url);
                }
            }

            _log.LogInformation("medboom: fetched {Count} Tehran clinical ads (from {Pool} pooled)", items.Count, pool.Count);
            return items;
        }
        catch (Exception ex)
        {
            // Best-effort source: any failure (index fetch, cancellation, …) yields an empty result.
            _log.LogWarning(ex, "medboom fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>Returns the N of a "…posts-post-N.xml" sitemap URL, or 0 when it cannot be parsed.</summary>
    private static int SitemapNumber(string url)
    {
        var m = SitemapNumberRegex.Match(url);
        return m.Success
               && int.TryParse(m.Groups[1].Value, System.Globalization.NumberStyles.None,
                   System.Globalization.CultureInfo.InvariantCulture, out var n)
            ? n
            : 0;
    }

    /// <summary>
    /// True when the percent-decoded URL contains a clinical-role marker and no excluded
    /// (veterinary / non-staffing) marker.
    /// </summary>
    private static bool IsClinicalSlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (ExcludeSlugs.Any(x => slug.Contains(x, StringComparison.Ordinal))) return false;
        return RoleSlugs.Any(r => slug.Contains(r, StringComparison.Ordinal));
    }

    /// <summary>
    /// True when the percent-decoded URL names a non-Tehran city as a whole slug token.
    /// </summary>
    /// <remarks>
    /// This is only a pre-filter that saves page fetches; the page-text Tehran check in
    /// <see cref="FetchAsync"/> is authoritative. It therefore errs on the side of keeping a URL:
    /// whole-token matching avoids dropping slugs such as «رشته» (contains «رشت») or «کاشانی»
    /// (contains «کاشان»), and a slug that mentions Tehran is never dropped.
    /// </remarks>
    private static bool IsOtherCitySlug(string url)
    {
        var slug = Uri.UnescapeDataString(url);
        if (slug.Contains(Tehran, StringComparison.Ordinal)) return false;

        return slug
            .Split(SlugSeparators, StringSplitOptions.RemoveEmptyEntries)
            .Any(token => OtherCityTokens.Contains(token));
    }

    /// <summary>
    /// Yields the trimmed, entity-decoded content of every <c>&lt;loc&gt;</c> element in a sitemap
    /// document, in document order. Empty entries are skipped.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRegex.Matches(xml)
            // Sitemap URLs are XML text, so "&amp;" etc. must be decoded before the URL is requested.
            .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim())
            .Where(u => u.Length > 0);

    /// <summary>
    /// Builds the plain-text ad from a post page: the og:title (site suffix after '|' removed),
    /// followed by the longer of the entry-content text and the og:description, capped at
    /// <see cref="MaxAdLength"/> characters, plus a phone line when harvested phones are not
    /// already present in the text.
    /// </summary>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Cut at '|' only when it is past index 10 — presumably so a short leading
            // fragment is not mistaken for the whole title.
            var bar = title.IndexOf('|');
            if (bar > 10) title = title[..bar].Trim();
        }

        var ogBody = Meta(html, "og:description");
        var entry = BetweenClass(html, "entry-content");
        var entryText = entry is null ? null : HtmlUtil.ToPlainText(entry);
        var body = (entryText?.Length ?? 0) > (ogBody?.Length ?? 0) ? entryText : ogBody;

        var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p))));
        if (text.Length > MaxAdLength)
        {
            // Do not leave a dangling high surrogate at the cut point.
            var cut = char.IsHighSurrogate(text[MaxAdLength - 1]) ? MaxAdLength - 1 : MaxAdLength;
            text = text[..cut];
        }

        var phones = HtmlUtil.HarvestPhones(body ?? "");
        if (phones.Count > 0 && !phones.Any(text.Contains))
            text += "\nشماره تماس: " + string.Join("، ", phones);
        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta property="…"&gt;</c> tag
    /// whose property equals <paramref name="prop"/>, or <c>null</c> when there is none.
    /// </summary>
    /// <remarks>
    /// The value ends at the same quote character that opened it, so an apostrophe inside a
    /// double-quoted value does not truncate it. Both attribute orders are accepted.
    /// </remarks>
    private static string? Meta(string html, string prop)
    {
        var name = Regex.Escape(prop);
        const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        // property=… before content=…
        var m = Regex.Match(
            html,
            $"<meta\\s+(?:[^>]*?\\s)?property\\s*=\\s*(?<pq>[\"']){name}\\k<pq>(?:[^>]*?\\s)?content\\s*=\\s*(?<cq>[\"'])(?<v>.*?)\\k<cq>",
            Options);

        if (!m.Success)
        {
            // content=… before property=… (the value may not contain '>' here, so the lazy match
            // cannot run on into a later tag).
            m = Regex.Match(
                html,
                $"<meta\\s+(?:[^>]*?\\s)?content\\s*=\\s*(?<cq>[\"'])(?<v>[^>]*?)\\k<cq>(?:[^>]*?\\s)?property\\s*=\\s*(?<pq>[\"']){name}\\k<pq>",
                Options);
        }

        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups["v"].Value) : null;
    }

    /// <summary>
    /// Returns the inner HTML of the first div/article/section whose class attribute contains
    /// <paramref name="cls"/>, or <c>null</c> when no such element (or no closing tag) is found.
    /// </summary>
    /// <remarks>
    /// Nested div/article/section elements are balanced, so the result runs to the element's own
    /// closing tag rather than to the first nested one. If the markup is unbalanced, the content
    /// up to the first closing tag is returned instead.
    /// </remarks>
    private static string? BetweenClass(string html, string cls)
    {
        var open = Regex.Match(
            html,
            $"<(?:div|article|section)[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>");
        if (!open.Success) return null;

        var start = open.Index + open.Length;
        var depth = 1;
        var firstClose = -1;

        for (var tag = ContainerTagRegex.Match(html, start); tag.Success; tag = tag.NextMatch())
        {
            var isClosing = tag.Groups[1].Length > 0;
            if (isClosing)
            {
                if (firstClose < 0) firstClose = tag.Index;
                if (--depth == 0) return html[start..tag.Index];
            }
            else if (!tag.Value.EndsWith("/>", StringComparison.Ordinal))
            {
                depth++;
            }
        }

        // Unbalanced markup: fall back to the content before the first closing tag, if any.
        return firstClose >= 0 ? html[start..firstClose] : null;
    }
}
|
||||
@@ -58,6 +58,9 @@ public class SettingsService
|
||||
s.IranEstekhdamEnabled = incoming.IranEstekhdamEnabled;
|
||||
s.IranEstekhdamMaxAds = Math.Clamp(incoming.IranEstekhdamMaxAds, 1, 500);
|
||||
s.IranEstekhdamUseProxy = incoming.IranEstekhdamUseProxy;
|
||||
s.MedboomEnabled = incoming.MedboomEnabled;
|
||||
s.MedboomMaxAds = Math.Clamp(incoming.MedboomMaxAds, 1, 500);
|
||||
s.MedboomUseProxy = incoming.MedboomUseProxy;
|
||||
s.SmsEnabled = incoming.SmsEnabled;
|
||||
s.SmsApiKey = incoming.SmsApiKey?.Trim();
|
||||
s.SmsTemplate = incoming.SmsTemplate?.Trim();
|
||||
|
||||
Reference in New Issue
Block a user