// File: hamkadr/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs
// (C#, 114 lines, 4.7 KiB)

using System.Text;
using System.Text.Json;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Best-effort Divar fetch: posts each configured query to Divar's search endpoint and harvests
/// post titles + descriptions from the JSON response. Enabled + city + queries come from admin
/// settings (DB). Divar's private API shifts shape, so we walk JSON tolerantly and fail soft:
/// a failing query is logged and skipped instead of aborting the whole fetch.
/// </summary>
public class DivarListingSource : IListingSource
{
    // Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search
    // is this POST endpoint, which returns POST_ROW widgets we can harvest.
    private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";
    private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";

    /// <summary>Display name of this source; also stamped on every scraped item.</summary>
    private const string SourceName = "دیوار";

    /// <summary>Link attached to every harvested item.</summary>
    private const string ListingUrl = "https://divar.ir";

    /// <summary>Marker found in the response body when Divar serves its anti-bot page.</summary>
    private const string BlockedMarker = "BLOCKING_VIEW";

    /// <summary>City id used when the configured city is empty or unrecognized (Tehran).</summary>
    private const string DefaultCityId = "1";

    /// <summary>Upper bound on harvested texts kept per query.</summary>
    private const int MaxItemsPerQuery = 25;

    /// <summary>Harvested texts shorter than this are discarded.</summary>
    private const int MinTextLength = 15;

    /// <summary>JSON property names probed, in priority order, for a post's description.</summary>
    private static readonly string[] DescKeys =
        { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<DivarListingSource> _log;

    public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    public string Name => SourceName;

    /// <summary>
    /// Runs every configured Divar query and returns the harvested post texts.
    /// </summary>
    /// <param name="s">Admin settings supplying the enabled flag, city, queries and proxy choice.</param>
    /// <param name="ct">Token used to cancel the in-flight HTTP calls.</param>
    /// <returns>
    /// Harvested items (at most <see cref="MaxItemsPerQuery"/> per query); empty when Divar is
    /// disabled or no queries are configured.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled. All other per-query failures are logged and skipped.
    /// </exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        var queries = AppSetting.SplitList(s.DivarQueries);
        if (!s.DivarEnabled || queries.Count == 0)
        {
            return Array.Empty<ScrapedItem>();
        }

        var cityId = CityId(s.DivarCity);
        var client = _clients.For(s, s.DivarUseProxy);
        var items = new List<ScrapedItem>();

        foreach (var q in queries)
        {
            try
            {
                using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
                {
                    Content = new StringContent(BuildPayload(cityId, q), Encoding.UTF8, "application/json"),
                };
                req.Headers.TryAddWithoutValidation("User-Agent", Ua);

                using var resp = await client.SendAsync(req, ct);
                var body = await resp.Content.ReadAsStringAsync(ct);
                if (!resp.IsSuccessStatusCode || body.Contains(BlockedMarker, StringComparison.Ordinal))
                {
                    _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", q, (int)resp.StatusCode);
                    continue;
                }

                using var doc = JsonDocument.Parse(body);
                foreach (var text in Harvest(doc.RootElement).Take(MaxItemsPerQuery))
                {
                    items.Add(new ScrapedItem(SourceName, text, ListingUrl));
                }
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // The caller asked us to stop: do not keep issuing (and logging) the remaining
                // queries. HttpClient timeouts also surface as OperationCanceledException but
                // leave the token un-cancelled, so they still take the fail-soft path below.
                throw;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
            }
        }

        return items;
    }

    /// <summary>Serializes the search request body for one query, restricted to the jobs category.</summary>
    private static string BuildPayload(string cityId, string query) =>
        JsonSerializer.Serialize(new
        {
            city_ids = new[] { cityId },
            search_data = new
            {
                form_data = new { data = new { category = new { str = new { value = "jobs" } } } },
                query,
            },
        });

    /// <summary>Divar uses numeric city IDs in its API. Pass a number through; map common slugs;
    /// default to Tehran (1). Admin can enter the numeric id directly in settings, using ASCII,
    /// Persian or Arabic-Indic digits.</summary>
    private static string CityId(string? city)
    {
        var key = ToAsciiDigits((city ?? "").Trim()).ToLowerInvariant();

        // NumberStyles.None + invariant culture: digits only, so "-1" or "+3" are not mistaken
        // for ids and the result does not depend on the server's culture.
        if (int.TryParse(key, System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture, out _))
        {
            return key;
        }

        return key switch
        {
            "tehran" or "تهران" => "1",
            "isfahan" or "esfahan" or "اصفهان" => "3",
            "mashhad" or "مشهد" => "4",
            "shiraz" or "شیراز" => "5",
            "tabriz" or "تبریز" => "6",
            "karaj" or "کرج" => "1745",
            _ => DefaultCityId,
        };
    }

    /// <summary>Replaces Persian (U+06F0..U+06F9) and Arabic-Indic (U+0660..U+0669) digits with
    /// their ASCII equivalents; every other character is kept as-is.</summary>
    private static string ToAsciiDigits(string value)
    {
        var sb = new StringBuilder(value.Length);
        foreach (var c in value)
        {
            sb.Append(c switch
            {
                >= '\u06F0' and <= '\u06F9' => (char)('0' + (c - '\u06F0')),
                >= '\u0660' and <= '\u0669' => (char)('0' + (c - '\u0660')),
                _ => c,
            });
        }

        return sb.ToString();
    }

    /// <summary>
    /// Depth-first walk over an arbitrary JSON tree. Every object with a string "title" yields
    /// "title — description" (description = first non-blank string among <see cref="DescKeys"/>,
    /// omitted when none exists), provided the trimmed text has at least
    /// <see cref="MinTextLength"/> characters. Nested objects and arrays are always descended into.
    /// </summary>
    private static IEnumerable<string> Harvest(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
            {
                var sb = new StringBuilder(t.GetString());
                foreach (var k in DescKeys)
                {
                    if (!el.TryGetProperty(k, out var d) || d.ValueKind != JsonValueKind.String)
                    {
                        continue;
                    }

                    // Skip blank descriptions: they would leave a dangling " —" that also
                    // inflates the length check; a later key may carry the real text.
                    var desc = d.GetString();
                    if (!string.IsNullOrWhiteSpace(desc))
                    {
                        sb.Append(" — ").Append(desc);
                        break;
                    }
                }

                var text = sb.ToString().Trim();
                if (text.Length >= MinTextLength)
                {
                    yield return text;
                }
            }

            foreach (var p in el.EnumerateObject())
            {
                foreach (var x in Harvest(p.Value))
                {
                    yield return x;
                }
            }
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
            {
                foreach (var x in Harvest(item))
                {
                    yield return x;
                }
            }
        }
    }
}