Files
hamkadr/src/JobsMedical.Web/Services/Scraping/MedjobsListingSource.cs
T
soroush.asadi 213af9db48
CI/CD / CI · dotnet build (push) Successful in 2m37s
CI/CD / Deploy · hamkadr (push) Successful in 1m11s
AI tag/category assignment + phone extraction from web ads
AI (when enabled, now that the server proxy is up):
- AiStructured gains phone, personName, yearsExperience, isLicensed.
- The auditor appends an authoritative output-schema to the admin prompt
  so classification stays correct even with an older stored prompt — it
  now classifies kind as shift|job|talent and extracts the contact phone
  and talent details.
- Ingestion publish prefers the AI's tags (kind/role/city/facility/phone +
  talent fields) over the heuristic parser when present.
- Default prompt updated to describe the three kinds + new fields.

Phone extraction from websites (Medjobs / generic sites), where the
number sits behind a "تماس با این آگهی" reveal:
- HtmlUtil.HarvestPhones scans the full markup for tel: links, JSON-LD
  "telephone", data-*phone* attributes, and inline Iranian mobile/landline
  numbers (Persian digits folded), normalized (mobiles 09…, landlines 0…).
- Medjobs + Website sources append harvested numbers to the ad text so the
  parser/AI capture them; manual review then prefills the phone too.
- Parser phone extraction now also captures a landline as a fallback.

Note: if a site loads the number purely via XHR (not in HTML), a
per-source reveal endpoint would be a follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 08:11:14 +03:30

120 lines
5.1 KiB
C#

using System.Text.RegularExpressions;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Scrapes job ads from medjobs.ir (a WordPress "ad_listing" classifieds site). It reads the
/// site's own sitemaps (sitemap_index.xml → ad_listing-sitemapN.xml) to enumerate every ad URL,
/// then fetches each ad page and extracts its title + description. The engine's content-hash
/// dedupe means each ad is only ever ingested once, so repeated runs pick up only new ads.
/// Published items become job pages on hamkadr.ir (the SEO goal).
/// </summary>
public class MedjobsListingSource : IListingSource
{
    private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";

    /// <summary>Ads whose final extracted text (including any appended phone line) is shorter than this are skipped.</summary>
    private const int MinAdTextLength = 25;

    /// <summary>Cap, in UTF-16 chars, on the title + body text; applied before the phone line is appended.</summary>
    private const int MaxAdTextLength = 1800;

    /// <summary>Matches one sitemap <c>&lt;loc&gt;…&lt;/loc&gt;</c> element; group 1 is the raw (still XML-escaped) URL.</summary>
    private static readonly Regex LocRegex = new("<loc>([^<]+)</loc>", RegexOptions.Compiled);

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<MedjobsListingSource> _log;

    public MedjobsListingSource(ScrapeHttpClients clients, ILogger<MedjobsListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "مدجابز (medjobs.ir)";

    /// <summary>
    /// Enumerates ad URLs from the site's ad_listing sitemaps and fetches up to
    /// <c>MedjobsMaxAds</c> (clamped to 1..500) ad pages, returning one item per ad whose
    /// extracted text is long enough to be useful.
    /// </summary>
    /// <param name="s">Settings; reads <c>MedjobsEnabled</c>, <c>MedjobsMaxAds</c> and <c>MedjobsUseProxy</c>.</param>
    /// <param name="ct">Cancels the run.</param>
    /// <returns>
    /// The scraped ads. Empty when the source is disabled, when no ad_listing sitemap is found,
    /// or when the run fails as a whole (the failure is logged, not thrown).
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled. Timeouts and other per-request failures are NOT
    /// surfaced this way; they are logged and the run continues.
    /// </exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        if (!s.MedjobsEnabled)
        {
            return Array.Empty<ScrapedItem>();
        }

        var max = Math.Clamp(s.MedjobsMaxAds, 1, 500);
        var client = _clients.For(s, s.MedjobsUseProxy);
        try
        {
            // 1. sitemap index → the ad_listing sitemaps
            var index = await client.GetStringAsync(SitemapIndex, ct);
            var adSitemaps = Locs(index)
                .Where(u => u.Contains("ad_listing-sitemap", StringComparison.Ordinal))
                .ToList();
            if (adSitemaps.Count == 0)
            {
                _log.LogWarning("medjobs: no ad_listing sitemaps found");
                return Array.Empty<ScrapedItem>();
            }

            // 2. collect ad URLs (skip the bare /ads/ archive)
            // NOTE(review): URLs are taken in sitemap order and capped at `max`. If the site lists
            // its oldest ads first, new ads beyond the cap would never be reached — confirm the
            // sitemap ordering against the live site.
            var adUrls = new List<string>();
            foreach (var sm in adSitemaps)
            {
                if (adUrls.Count >= max)
                {
                    break;
                }

                ct.ThrowIfCancellationRequested();
                try
                {
                    var body = await client.GetStringAsync(sm, ct);
                    adUrls.AddRange(Locs(body).Where(u =>
                        u.Contains("/ads/", StringComparison.Ordinal)
                        && !u.TrimEnd('/').EndsWith("/ads", StringComparison.Ordinal)));
                }
                catch (Exception ex) when (!IsCallerCancellation(ex, ct))
                {
                    // One bad sitemap must not abort the run.
                    _log.LogWarning(ex, "medjobs: sitemap {Sm} failed", sm);
                }
            }

            adUrls = adUrls.Distinct().Take(max).ToList();

            // 3. fetch each ad page → title + description
            var items = new List<ScrapedItem>();
            foreach (var url in adUrls)
            {
                ct.ThrowIfCancellationRequested();
                try
                {
                    var html = await client.GetStringAsync(url, ct);
                    var text = ExtractAd(html);
                    if (text.Length >= MinAdTextLength)
                    {
                        items.Add(new ScrapedItem("مدجابز", text, url));
                    }
                }
                catch (Exception ex) when (!IsCallerCancellation(ex, ct))
                {
                    // One bad ad page must not abort the run.
                    _log.LogWarning(ex, "medjobs: ad {Url} failed", url);
                }
            }

            _log.LogInformation("medjobs: fetched {Count} ads", items.Count);
            return items;
        }
        catch (Exception ex) when (!IsCallerCancellation(ex, ct))
        {
            // Best-effort source: any other failure yields an empty result rather than an exception.
            _log.LogWarning(ex, "medjobs fetch failed");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// True when <paramref name="ex"/> is a cancellation caused by the caller's token. An HTTP
    /// timeout also surfaces as an <see cref="OperationCanceledException"/>, but without
    /// <paramref name="ct"/> being cancelled, so it is still treated as an ordinary failure.
    /// </summary>
    private static bool IsCallerCancellation(Exception ex, CancellationToken ct)
        => ex is OperationCanceledException && ct.IsCancellationRequested;

    /// <summary>
    /// Yields the URL of every <c>&lt;loc&gt;</c> element in a sitemap document, with XML entity
    /// escapes (e.g. <c>&amp;amp;</c>) decoded and surrounding whitespace trimmed.
    /// </summary>
    private static IEnumerable<string> Locs(string xml)
        => LocRegex.Matches(xml)
            .Select(m => System.Net.WebUtility.HtmlDecode(m.Groups[1].Value).Trim())
            .Where(u => u.Length > 0);

    /// <summary>Title (og:title, site suffix stripped) + body (entry/description content or og:description).</summary>
    /// <param name="html">Full markup of one ad page.</param>
    /// <returns>Plain text of the ad, optionally followed by a line listing harvested phone numbers.</returns>
    private static string ExtractAd(string html)
    {
        var title = Meta(html, "og:title");
        if (title is not null)
        {
            // Drop the " | site name" suffix. The `> 10` guard leaves the title alone when the
            // bar appears within its first few characters.
            var bar = title.IndexOf('|');
            if (bar > 10)
            {
                title = title[..bar].Trim();
            }
        }

        string? body = BetweenClass(html, "rtcl-description")
            ?? BetweenClass(html, "entry-content")
            ?? Meta(html, "og:description");

        var parts = new[] { title, body }.Where(p => !string.IsNullOrWhiteSpace(p));
        var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
        if (text.Length > MaxAdTextLength)
        {
            // Never cut through a surrogate pair: a lone high surrogate is invalid UTF-16.
            var cut = char.IsHighSurrogate(text[MaxAdTextLength - 1]) ? MaxAdTextLength - 1 : MaxAdTextLength;
            text = text[..cut];
        }

        // The contact number is often outside the description (in a tel: link / data attribute the
        // page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
        // Nothing is appended when the text already contains any one of the harvested numbers.
        var phones = HtmlUtil.HarvestPhones(html);
        if (phones.Count > 0 && !phones.Any(text.Contains))
        {
            text += "\nشماره تماس: " + string.Join("، ", phones);
        }

        return text;
    }

    /// <summary>
    /// Returns the HTML-decoded <c>content</c> of the first <c>&lt;meta property="…"&gt;</c> tag
    /// whose property equals <paramref name="prop"/>, or <c>null</c> when there is none.
    /// </summary>
    /// <remarks>
    /// The value is read up to the quote character that opened it, so a double-quoted value may
    /// contain apostrophes (and vice versa). The tag may list <c>property</c> and <c>content</c>
    /// in either order; the property-first form is tried first.
    /// </remarks>
    private static string? Meta(string html, string prop)
    {
        var name = Regex.Escape(prop);

        // Quoted attribute value: group "q" is the opening quote, group "v" everything up to the
        // next occurrence of that same quote.
        const string Value = "(?<q>[\"'])(?<v>(?:(?!\\k<q>).)*)\\k<q>";

        var m = Regex.Match(
            html,
            $"<meta\\s[^>]*?property=[\"']{name}[\"'][^>]*?\\scontent={Value}",
            RegexOptions.Singleline);
        if (!m.Success)
        {
            m = Regex.Match(
                html,
                $"<meta\\s[^>]*?content={Value}[^>]*?\\sproperty=[\"']{name}[\"']",
                RegexOptions.Singleline);
        }

        return m.Success ? System.Net.WebUtility.HtmlDecode(m.Groups["v"].Value) : null;
    }

    /// <summary>Grab the inner HTML of the first &lt;div class="...name..."&gt; (best-effort).</summary>
    /// <remarks>
    /// Regex-based, not a real HTML parser: the capture ends at the first <c>&lt;/div&gt;</c>, so a
    /// container with nested divs is cut short, and <paramref name="cls"/> is matched as a
    /// substring of the class attribute. The pattern is left as-is on purpose — changing what is
    /// captured changes the ad text, which presumably feeds the engine's content-hash dedupe.
    /// </remarks>
    private static string? BetweenClass(string html, string cls)
    {
        var m = Regex.Match(
            html,
            $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</div>",
            RegexOptions.Singleline);
        return m.Success ? m.Groups[1].Value : null;
    }
}