113 lines
4.7 KiB
C#
113 lines
4.7 KiB
C#
|
|
using System.Text.RegularExpressions;
|
||
|
|
using JobsMedical.Web.Models;
|
||
|
|
|
||
|
|
namespace JobsMedical.Web.Services.Scraping;
|
||
|
|
|
||
|
|
/// <summary>
/// Scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site). It reads the
/// site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate ad URLs
/// (capped by <c>AppSetting.MedjobsMaxAds</c>, itself clamped to 1–500), then fetches each ad
/// page and extracts its title + description. The engine's content-hash dedupe means each ad is
/// only ever ingested once, so repeated runs pick up only new ads.
/// Published items become job pages on hamkadr.ir (the SEO goal).
/// </summary>
/// <remarks>
/// <see cref="FetchAsync"/> is best-effort: network/parse failures and caller cancellation are
/// logged and produce an empty (or partial) result instead of an exception.
/// </remarks>
public class MedjobsListingSource : IListingSource
{
    private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";

    /// <summary>Upper bound applied to the configured per-run ad count.</summary>
    private const int MaxAdsCeiling = 500;

    /// <summary>Ads whose extracted plain text is shorter than this are dropped.</summary>
    private const int MinAdTextLength = 25;

    /// <summary>Extracted plain text is cut to at most this many UTF-16 code units.</summary>
    private const int MaxAdTextLength = 1800;

    /// <summary>
    /// The " | site name" suffix is only stripped from a title when the bar sits past this index,
    /// so a bar near the very start does not reduce the title to a tiny fragment.
    /// </summary>
    private const int MinTitleBarIndex = 10;

    // Built once instead of on every Locs() call.
    private static readonly Regex LocRegex = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);

    private readonly IHttpClientFactory _http;
    private readonly ILogger<MedjobsListingSource> _log;

    /// <summary>Creates the source.</summary>
    /// <param name="http">Factory used to obtain the named <c>"scrape"</c> HTTP client.</param>
    /// <param name="log">Logger for progress and per-URL failures.</param>
    public MedjobsListingSource(IHttpClientFactory http, ILogger<MedjobsListingSource> log)
    {
        _http = http;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "مدجابز (medjobs.ir)";

    /// <summary>
    /// Enumerates ad URLs from the site's sitemaps and downloads each ad page.
    /// </summary>
    /// <param name="s">Settings; <c>MedjobsEnabled</c> gates the run and <c>MedjobsMaxAds</c>
    /// (clamped to 1–500) caps how many ad pages are fetched.</param>
    /// <param name="ct">Cancels the run. Cancellation stops work immediately and yields an empty
    /// list rather than an exception.</param>
    /// <returns>
    /// One item per ad whose extracted text has at least 25 characters. Empty when the source is
    /// disabled, no ad sitemaps are found, the run is cancelled, or the run fails as a whole.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedjobsEnabled)
        {
            return Array.Empty<ScrapedItem>();
        }

        var max = Math.Clamp(s.MedjobsMaxAds, 1, MaxAdsCeiling);
        var client = _http.CreateClient("scrape");

        try
        {
            // 1. sitemap index → the ad_listing sitemaps
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var adSitemaps = Locs(index)
                .Where(u => u.Contains("ad_listing-sitemap", StringComparison.Ordinal))
                .ToList();
            if (adSitemaps.Count == 0)
            {
                _log.LogWarning("medjobs: no ad_listing sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs (skip the bare /ads/ archive)
            var adUrls = await CollectAdUrlsAsync(client, adSitemaps, max, ct);

            // 3. fetch each ad page → title + description
            var items = await FetchAdsAsync(client, adUrls, ct);
            _log.LogInformation("medjobs: fetched {Count} ads", items.Count);
            return items;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller asked us to stop: not a failure, so no warning and no stack trace.
            _log.LogInformation("medjobs: fetch cancelled");
            return Array.Empty<ScrapedItem>();
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "medjobs fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// Downloads each ad_listing sitemap in order and gathers up to <paramref name="max"/>
    /// distinct ad URLs. A sitemap that fails to download is logged and skipped.
    /// </summary>
    private async Task<List<string>> CollectAdUrlsAsync(
        HttpClient client, IReadOnlyList<string> adSitemaps, int max, CancellationToken ct)
    {
        // NOTE(review): URLs are taken in sitemap order and capped at `max`. If the site lists its
        // oldest ads first, ads beyond the cap would never be reached — confirm the ordering.
        var adUrls = new List<string>();
        foreach (var sm in adSitemaps)
        {
            if (adUrls.Count >= max)
            {
                break;
            }

            try
            {
                var body = await client.GetStringAsync(sm, ct);
                adUrls.AddRange(Locs(body).Where(IsAdUrl));
            }
            catch (Exception ex) when (!IsCallerCancellation(ex, ct))
            {
                _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm);
            }
        }

        return adUrls.Distinct().Take(max).ToList();
    }

    /// <summary>
    /// Downloads every ad page and keeps those whose extracted text is long enough to be a real
    /// ad. A page that fails to download or parse is logged and skipped.
    /// </summary>
    private async Task<List<ScrapedItem>> FetchAdsAsync(
        HttpClient client, IReadOnlyList<string> adUrls, CancellationToken ct)
    {
        var items = new List<ScrapedItem>();
        foreach (var url in adUrls)
        {
            ct.ThrowIfCancellationRequested();
            try
            {
                var html = await client.GetStringAsync(url, ct);
                var text = ExtractAd(html);
                if (text.Length >= MinAdTextLength)
                {
                    items.Add(new ScrapedItem("مدجابز", text, url));
                }
            }
            catch (Exception ex) when (!IsCallerCancellation(ex, ct))
            {
                // Includes HttpClient timeouts, which surface as TaskCanceledException while the
                // caller's token is still live.
                _log.LogWarning(ex, "medjobs: ad {Url} failed", url);
            }
        }

        return items;
    }

    /// <summary>True when <paramref name="ex"/> is the result of the caller cancelling <paramref name="ct"/>.</summary>
    private static bool IsCallerCancellation(Exception ex, CancellationToken ct)
        => ex is OperationCanceledException && ct.IsCancellationRequested;

    /// <summary>True for URLs under <c>/ads/</c>, excluding the bare <c>/ads</c> archive page itself.</summary>
    private static bool IsAdUrl(string url)
        => url.Contains("/ads/", StringComparison.Ordinal)
           && !url.TrimEnd('/').EndsWith("/ads", StringComparison.Ordinal);

    /// <summary>
    /// Yields the trimmed text of every <c>&lt;loc&gt;</c> element in a sitemap document, with
    /// XML entities (e.g. <c>&amp;amp;</c>) decoded so the result is a usable URL.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRegex.Matches(xml).Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim());

    /// <summary>Title (og:title, site suffix stripped) + body (entry/description content or og:description).</summary>
    /// <param name="html">Raw HTML of an ad page.</param>
    /// <returns>Plain text of at most 1800 characters; empty when nothing could be extracted.</returns>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            var bar = title.IndexOf('|');
            if (bar > MinTitleBarIndex)
            {
                title = title[..bar].Trim();
            }
        }

        // First source that is present wins; later ones are fallbacks.
        string? body = BetweenClass(html, "rtcl-description")
                       ?? BetweenClass(html, "entry-content")
                       ?? Meta(html, "og:description");

        var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
        var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
        return Truncate(text, MaxAdTextLength);
    }

    /// <summary>
    /// Cuts <paramref name="text"/> to at most <paramref name="maxLength"/> UTF-16 code units
    /// without leaving half of a surrogate pair at the end.
    /// </summary>
    private static string Truncate(string text, int maxLength)
    {
        if (text.Length <= maxLength)
        {
            return text;
        }

        var cut = char.IsHighSurrogate(text[maxLength - 1]) ? maxLength - 1 : maxLength;
        return text[..cut];
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first
    /// <c>&lt;meta property="prop" … content="…"&gt;</c> tag, or <c>null</c> when there is none.
    /// Only tags that list <c>property</c> before <c>content</c> are recognised.
    /// </summary>
    private static string? Meta(string html, string prop)
    {
        // The closing quote must be the same character as the opening one (backreference), so a
        // value such as content="Doctor's job" is not cut short at the apostrophe.
        var pattern = $"<meta[^>]+property=([\"']){Regex.Escape(prop)}\\1[^>]+content=([\"'])(?<value>.*?)\\2";
        var m = Regex.Match(html, pattern, RegexOptions.Singleline);
        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups["value"].Value) : null;
    }

    /// <summary>
    /// Grab the inner HTML of the first <c>&lt;div class="...name..."&gt;</c> (best-effort).
    /// </summary>
    /// <remarks>
    /// The class name is matched as a substring of the attribute value, and the capture ends at
    /// the first <c>&lt;/div&gt;</c>, so content after a nested div's closing tag is not included.
    /// </remarks>
    private static string? BetweenClass(string html, string cls)
    {
        var m = Regex.Match(html, $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</div>",
            RegexOptions.Singleline);
        return m.Success ? m.Groups[1].Value : null;
    }
}
|