using System.Text;
using System.Text.Json;
using JobsMedical.Web.Models;

namespace JobsMedical.Web.Services.Scraping;

/// <summary>
/// Best-effort Divar fetch: queries Divar's search endpoint for each term and harvests post
/// titles + descriptions. Enabled + city + queries come from admin settings (DB). Divar's
/// private API shifts shape, so we walk JSON tolerantly and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
    // Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search
    // is this POST endpoint, which returns POST_ROW widgets we can harvest.
    private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";
    private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";

    /// <summary>Upper bound on harvested texts taken from a single query's response.</summary>
    private const int MaxItemsPerQuery = 25;

    /// <summary>Harvested texts shorter than this (after trimming) are discarded.</summary>
    private const int MinTextLength = 15;

    /// <summary>
    /// Keys probed, in order, for a description to append to a title; the first string match wins.
    /// </summary>
    private static readonly string[] DescKeys =
    {
        "description",
        "middle_description_text",
        "subtitle",
        "bottom_description_text",
        "normal_text",
    };

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<DivarListingSource> _log;

    public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source; also used as the source label of every item.</summary>
    public string Name => "دیوار";

    /// <summary>
    /// Runs every configured query against Divar and returns the harvested post texts.
    /// </summary>
    /// <param name="s">Admin settings supplying the enabled flag, city, queries and proxy choice.</param>
    /// <param name="ct">Token used to cancel the HTTP calls.</param>
    /// <returns>
    /// The collected items; empty when the source is disabled or no queries are configured.
    /// A query that is blocked, fails or returns unparsable JSON is logged and skipped.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="ct"/> is cancelled while a request is in flight.
    /// </exception>
    // NOTE(review): generic type arguments were lost in the extracted source; the
    // IReadOnlyList<ScrapedItem> return type is reconstructed - confirm against IListingSource.
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        var queries = AppSetting.SplitList(s.DivarQueries);
        if (!s.DivarEnabled || queries.Count == 0)
        {
            return Array.Empty<ScrapedItem>();
        }

        var cityId = CityId(s.DivarCity);
        var client = _clients.For(s, s.DivarUseProxy);
        var items = new List<ScrapedItem>();

        // Identical text can be harvested more than once (e.g. a post matching several
        // queries); keep only the first occurrence.
        var seen = new HashSet<string>(StringComparer.Ordinal);

        foreach (var q in queries)
        {
            try
            {
                using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
                {
                    Content = new StringContent(BuildPayload(cityId, q), Encoding.UTF8, "application/json"),
                };
                req.Headers.TryAddWithoutValidation("User-Agent", Ua);

                using var resp = await client.SendAsync(req, ct);
                var body = await resp.Content.ReadAsStringAsync(ct);

                if (!resp.IsSuccessStatusCode || body.Contains("BLOCKING_VIEW", StringComparison.Ordinal))
                {
                    _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode);
                    continue;
                }

                using var doc = JsonDocument.Parse(body);
                foreach (var text in Harvest(doc.RootElement).Take(MaxItemsPerQuery))
                {
                    if (seen.Add(text))
                    {
                        items.Add(new ScrapedItem("دیوار", text, "https://divar.ir"));
                    }
                }
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Caller-requested cancellation is not a per-query failure: stop instead of
                // logging a warning and attempting every remaining query. Timeouts that were
                // not triggered through ct still fall through to the fail-soft handler below.
                throw;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
            }
        }

        return items;
    }

    /// <summary>Serializes the search request body for one query, restricted to the "jobs" category.</summary>
    private static string BuildPayload(string cityId, string query) =>
        JsonSerializer.Serialize(new
        {
            city_ids = new[] { cityId },
            search_data = new
            {
                form_data = new
                {
                    data = new
                    {
                        category = new { str = new { value = "jobs" } },
                    },
                },
                query,
            },
        });

    /// <summary>
    /// Divar uses numeric city IDs in its API. Pass a number through; map common slugs;
    /// default to Tehran (1). Admin can enter the numeric id directly in settings.
    /// </summary>
    /// <param name="city">Raw city setting: a numeric id, a known slug/Persian name, or null.</param>
    /// <returns>The city id as a string.</returns>
    private static string CityId(string? city)
    {
        city = (city ?? "").Trim().ToLowerInvariant();

        // NumberStyles.None accepts plain ASCII digits only, so signed or culture-formatted
        // input such as "-5" is not forwarded as an id and falls back to the default instead.
        if (int.TryParse(
                city,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out _))
        {
            return city;
        }

        return city switch
        {
            "tehran" or "تهران" => "1",
            "isfahan" or "esfahan" or "اصفهان" => "3",
            "mashhad" or "مشهد" => "4",
            "shiraz" or "شیراز" => "5",
            "tabriz" or "تبریز" => "6",
            "karaj" or "کرج" => "1745",
            _ => "1",
        };
    }

    /// <summary>
    /// Depth-first walk over a JSON tree. Every object with a string "title" yields
    /// "title — description" (description being the first <see cref="DescKeys"/> entry present
    /// as a string, if any), provided the trimmed text has at least
    /// <see cref="MinTextLength"/> characters. Nested objects and arrays are always descended
    /// into, whether or not the current object produced a text.
    /// </summary>
    private static IEnumerable<string> Harvest(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
            {
                var sb = new StringBuilder(t.GetString());
                foreach (var k in DescKeys)
                {
                    if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
                    {
                        sb.Append(" — ").Append(d.GetString());
                        break; // only the first matching description key is used
                    }
                }

                var text = sb.ToString().Trim();
                if (text.Length >= MinTextLength)
                {
                    yield return text;
                }
            }

            foreach (var p in el.EnumerateObject())
            {
                foreach (var x in Harvest(p.Value))
                {
                    yield return x;
                }
            }
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
            {
                foreach (var x in Harvest(item))
                {
                    yield return x;
                }
            }
        }
    }
}