2026-06-03 17:41:02 +03:30
|
|
|
|
using System.Text;
|
|
|
|
|
|
using System.Text.Json;
|
2026-06-04 00:44:11 +03:30
|
|
|
|
using JobsMedical.Web.Models;
|
2026-06-03 08:18:19 +03:30
|
|
|
|
|
|
|
|
|
|
namespace JobsMedical.Web.Services.Scraping;
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Best-effort Divar fetch: POSTs each configured search term to Divar's search API and harvests
/// post titles + descriptions, then looks in each post's detail JSON for phone numbers.
/// Enabled + city + queries come from admin settings (DB). Divar's private API shifts shape,
/// so we walk JSON tolerantly and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
    // Divar's web-search GET is anti-bot protected (returns a BLOCKING_VIEW). Their real search
    // is this POST endpoint, which returns POST_ROW widgets we can harvest.
    private const string SearchUrl = "https://api.divar.ir/v8/postlist/w/search";

    // Browser-like User-Agent sent with every request this class makes.
    private const string Ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";

    // The post detail endpoint returns the FULL description — many Divar job ads write the phone
    // straight into the body, so we can harvest it without Divar's (login-gated) contact reveal.
    private const string PostDetailUrl = "https://api.divar.ir/v8/posts-v2/web/";

    // Substring that identifies a blocked (anti-bot) response body.
    private const string BlockedMarker = "BLOCKING_VIEW";

    // Upper bound on harvested entries processed per search query.
    private const int MaxPostsPerQuery = 25;

    // Harvested text shorter than this is discarded.
    private const int MinTextLength = 15;

    // Properties probed, in this order, for a widget's descriptive text; only the first
    // string-valued match is appended to the title.
    private static readonly string[] DescKeys =
        { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };

    private readonly ScrapeHttpClients _clients;
    private readonly ILogger<DivarListingSource> _log;

    /// <summary>Creates the source.</summary>
    /// <param name="clients">Provider of the <see cref="HttpClient"/> used for Divar requests.</param>
    /// <param name="log">Logger that receives the fail-soft warnings.</param>
    public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
    {
        _clients = clients;
        _log = log;
    }

    /// <summary>Display name of this source.</summary>
    public string Name => "دیوار";

    /// <summary>
    /// Runs every configured Divar query and returns the harvested posts.
    /// </summary>
    /// <param name="s">Settings supplying the enabled flag, city, query list and proxy choice.</param>
    /// <param name="ct">Cancels the crawl; see remarks.</param>
    /// <returns>
    /// The items collected from all queries; empty when Divar is disabled or no queries are set.
    /// </returns>
    /// <remarks>
    /// A failing query is logged and skipped. Cancellation stops the crawl and returns whatever
    /// was collected so far instead of throwing, which keeps the method's fail-soft behaviour.
    /// </remarks>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
    {
        var queries = AppSetting.SplitList(s.DivarQueries);
        if (!s.DivarEnabled || queries.Count == 0)
        {
            return Array.Empty<ScrapedItem>();
        }

        var cityId = CityId(s.DivarCity);
        var client = _clients.For(s, s.DivarUseProxy);
        var items = new List<ScrapedItem>();

        foreach (var q in queries)
        {
            // Once cancelled, every further request would fail immediately; stop instead of
            // logging one spurious warning per remaining query.
            if (ct.IsCancellationRequested)
            {
                break;
            }

            try
            {
                await FetchQueryAsync(client, cityId, q, items, ct);
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Our caller cancelled; this is not a Divar failure.
                break;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
            }
        }

        return items;
    }

    /// <summary>
    /// Executes one search query and appends each harvested post to <paramref name="items"/>.
    /// Items are appended as they are produced, so a failure part-way through keeps earlier ones.
    /// </summary>
    private async Task FetchQueryAsync(
        HttpClient client, string cityId, string query, List<ScrapedItem> items, CancellationToken ct)
    {
        var payload = JsonSerializer.Serialize(new
        {
            city_ids = new[] { cityId },
            search_data = new
            {
                form_data = new { data = new { category = new { str = new { value = "jobs" } } } },
                query
            }
        });

        using var req = new HttpRequestMessage(HttpMethod.Post, SearchUrl)
        {
            Content = new StringContent(payload, Encoding.UTF8, "application/json")
        };
        req.Headers.TryAddWithoutValidation("User-Agent", Ua);

        using var resp = await client.SendAsync(req, ct);
        var body = await resp.Content.ReadAsStringAsync(ct);
        if (!resp.IsSuccessStatusCode || body.Contains(BlockedMarker, StringComparison.Ordinal))
        {
            _log.LogWarning("Divar blocked/failed for query {Query} (HTTP {Status})", query, (int)resp.StatusCode);
            return;
        }

        using var doc = JsonDocument.Parse(body);
        foreach (var (text, token) in Harvest(doc.RootElement).Take(MaxPostsPerQuery))
        {
            // Do not start another detail request after cancellation.
            ct.ThrowIfCancellationRequested();

            var url = token is not null ? $"https://divar.ir/v/{token}" : "https://divar.ir";
            var withPhone = text;
            if (token is not null)
            {
                var phones = await RevealPhonesAsync(client, token, ct);

                // Append the numbers only when none of them already appears in the text.
                if (phones.Count > 0 && !phones.Any(p => text.Contains(p, StringComparison.Ordinal)))
                {
                    withPhone = text + "\nشماره تماس: " + string.Join("، ", phones);
                }
            }

            items.Add(new ScrapedItem("دیوار", withPhone, url));
        }
    }

    /// <summary>Divar uses numeric city IDs in its API. Pass a number through; map common slugs;
    /// default to Tehran (1). Admin can enter the numeric id directly in settings.</summary>
    /// <remarks>
    /// Persian and Arabic-Indic digits are converted to ASCII, so an id typed on a Persian
    /// keyboard is recognised. Only unsigned digit strings that fit in an <see cref="int"/>
    /// count as ids; anything else that is not a known city name falls back to Tehran.
    /// </remarks>
    private static string CityId(string? city)
    {
        var key = NormalizeCityInput(city);

        // NumberStyles.None admits digits only: no sign, whitespace or separators.
        if (int.TryParse(
                key,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out _))
        {
            return key;
        }

        return key switch
        {
            "tehran" or "تهران" => "1",
            "isfahan" or "esfahan" or "اصفهان" => "3",
            "mashhad" or "مشهد" => "4",
            "shiraz" or "شیراز" => "5",
            "tabriz" or "تبریز" => "6",
            "karaj" or "کرج" => "1745",
            _ => "1",
        };
    }

    /// <summary>
    /// Trims and lower-cases the admin-entered city, converts Persian/Arabic-Indic digits to
    /// ASCII, and folds Arabic Yeh/Kaf to their Persian forms so names typed on an Arabic
    /// keyboard layout still match the city table.
    /// </summary>
    private static string NormalizeCityInput(string? city)
    {
        var trimmed = (city ?? "").Trim();
        var sb = new StringBuilder(trimmed.Length);
        foreach (var c in trimmed)
        {
            sb.Append(c switch
            {
                >= '\u06F0' and <= '\u06F9' => (char)('0' + (c - '\u06F0')), // Persian digits
                >= '\u0660' and <= '\u0669' => (char)('0' + (c - '\u0660')), // Arabic-Indic digits
                '\u064A' => '\u06CC', // Arabic Yeh -> Persian Yeh
                '\u0643' => '\u06A9', // Arabic Kaf -> Persian Keheh
                _ => c,
            });
        }

        return sb.ToString().ToLowerInvariant();
    }

    /// <summary>
    /// Fetch a post's detail JSON and harvest any contact number it contains (mostly numbers the
    /// poster wrote into the description). Divar's true "نمایش شماره" reveal is auth-gated; this
    /// covers the common case where the number is in the ad text. Fails soft.
    /// </summary>
    /// <returns>
    /// The harvested numbers, or an empty list on a non-success status, a blocked response,
    /// cancellation, or any exception.
    /// </returns>
    private async Task<List<string>> RevealPhonesAsync(HttpClient client, string token, CancellationToken ct)
    {
        try
        {
            using var req = new HttpRequestMessage(HttpMethod.Get, PostDetailUrl + token);
            req.Headers.TryAddWithoutValidation("User-Agent", Ua);
            req.Headers.TryAddWithoutValidation("Accept", "application/json");

            using var resp = await client.SendAsync(req, ct);
            if (!resp.IsSuccessStatusCode)
            {
                return new();
            }

            var body = await resp.Content.ReadAsStringAsync(ct);
            if (body.Contains(BlockedMarker, StringComparison.Ordinal))
            {
                return new();
            }

            return HtmlUtil.HarvestPhones(body);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller cancelled; not worth a warning.
            return new();
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "Divar detail/reveal failed for {Token}", token);
            return new();
        }
    }

    /// <summary>
    /// Walks the JSON tree depth-first and yields one entry for every object that has a string
    /// "title" and whose combined text is at least <see cref="MinTextLength"/> characters.
    /// An object is reported before its descendants are visited.
    /// </summary>
    /// <returns>Pairs of harvested text and the token found inside that object, if any.</returns>
    private static IEnumerable<(string text, string? token)> Harvest(JsonElement el)
    {
        switch (el.ValueKind)
        {
            case JsonValueKind.Object:
                if (el.TryGetProperty("title", out var title) && title.ValueKind == JsonValueKind.String)
                {
                    var text = BuildText(el, title);
                    if (text.Length >= MinTextLength)
                    {
                        yield return (text, FindToken(el));
                    }
                }

                foreach (var prop in el.EnumerateObject())
                {
                    foreach (var found in Harvest(prop.Value))
                    {
                        yield return found;
                    }
                }

                break;

            case JsonValueKind.Array:
                foreach (var child in el.EnumerateArray())
                {
                    foreach (var found in Harvest(child))
                    {
                        yield return found;
                    }
                }

                break;
        }
    }

    /// <summary>
    /// Joins the title with the first string-valued property listed in <see cref="DescKeys"/>
    /// (separated by " — ") and trims the result.
    /// </summary>
    private static string BuildText(JsonElement widget, JsonElement title)
    {
        var sb = new StringBuilder(title.GetString());
        foreach (var key in DescKeys)
        {
            if (widget.TryGetProperty(key, out var desc) && desc.ValueKind == JsonValueKind.String)
            {
                sb.Append(" — ").Append(desc.GetString());
                break;
            }
        }

        return sb.ToString().Trim();
    }

    /// <summary>Find the post token within a widget object (Divar tokens: 6–16 alphanumerics).</summary>
    /// <remarks>
    /// A "token" property directly on an object wins over tokens nested deeper inside it. Only
    /// ASCII letters and digits are accepted, because the token is placed into request URLs.
    /// </remarks>
    /// <returns>The first acceptable token, or <c>null</c> when none is found.</returns>
    private static string? FindToken(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            foreach (var p in el.EnumerateObject())
            {
                if (p.NameEquals("token")
                    && p.Value.ValueKind == JsonValueKind.String
                    && p.Value.GetString() is { Length: >= 6 and <= 16 } candidate
                    && candidate.All(IsAsciiAlphanumeric))
                {
                    return candidate;
                }
            }

            foreach (var p in el.EnumerateObject())
            {
                var nested = FindToken(p.Value);
                if (nested is not null)
                {
                    return nested;
                }
            }
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var child in el.EnumerateArray())
            {
                var nested = FindToken(child);
                if (nested is not null)
                {
                    return nested;
                }
            }
        }

        return null;
    }

    /// <summary>True for 0-9, a-z and A-Z only (unlike <see cref="char.IsLetterOrDigit(char)"/>,
    /// which also accepts non-ASCII letters and digits).</summary>
    private static bool IsAsciiAlphanumeric(char c) =>
        c is (>= '0' and <= '9') or (>= 'a' and <= 'z') or (>= 'A' and <= 'Z');
}
|