[Ingest] Route scraping through an optional V2Ray/Xray proxy (Telegram in Iran)
CI/CD / CI · dotnet build (push) Successful in 53s
CI/CD / Deploy · hamkadr (push) Successful in 1m12s

Telegram and some other sources are filtered in Iran. .NET cannot speak vmess/vless/trojan, so this adds an Xray sidecar (compose service 'xray', behind the 'proxy' profile) that converts the admin's config into a local SOCKS5 proxy (xray:10808). A new ScrapeHttpClients provider builds a proxied or direct HttpClient (WebProxy supports socks5/socks4/http), cached per proxy URL; all five ingestion sources (Telegram/Bale/Divar/Medjobs/Websites) now use it. Admin settings gain IngestProxyEnabled + IngestProxyUrl (with a migration; the UI is under sources). Also added a deploy/xray/config.json template and a README with vmess/vless/trojan examples.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-04 17:53:17 +03:30
parent 698565c460
commit cea27c8684
17 changed files with 1411 additions and 20 deletions
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,40 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace JobsMedical.Web.Migrations
{
    /// <summary>
    /// Schema migration that adds the two ingestion-proxy columns
    /// (<c>IngestProxyEnabled</c> and <c>IngestProxyUrl</c>) to the <c>AppSettings</c> table.
    /// </summary>
    public partial class IngestProxy : Migration
    {
        // Table and column names touched by this migration, shared by Up and Down.
        private const string SettingsTable = "AppSettings";
        private const string EnabledColumn = "IngestProxyEnabled";
        private const string UrlColumn = "IngestProxyUrl";

        /// <summary>
        /// Applies the migration: adds a non-nullable boolean flag (default <c>false</c>)
        /// and a nullable proxy-URL column limited to 200 characters.
        /// </summary>
        /// <param name="migrationBuilder">Builder that records the schema operations.</param>
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            // Existing rows receive 'false', so the proxy stays off until it is enabled.
            migrationBuilder.AddColumn<bool>(
                name: EnabledColumn,
                table: SettingsTable,
                type: "boolean",
                nullable: false,
                defaultValue: false);

            migrationBuilder.AddColumn<string>(
                name: UrlColumn,
                table: SettingsTable,
                type: "character varying(200)",
                maxLength: 200,
                nullable: true);
        }

        /// <summary>
        /// Reverts the migration by dropping both columns, in the order they were added.
        /// </summary>
        /// <param name="migrationBuilder">Builder that records the schema operations.</param>
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            foreach (var column in new[] { EnabledColumn, UrlColumn })
            {
                migrationBuilder.DropColumn(
                    name: column,
                    table: SettingsTable);
            }
        }
    }
}
@@ -83,6 +83,13 @@ namespace JobsMedical.Web.Migrations
b.Property<int>("IngestIntervalMinutes")
.HasColumnType("integer");
b.Property<bool>("IngestProxyEnabled")
.HasColumnType("boolean");
b.Property<string>("IngestProxyUrl")
.HasMaxLength(200)
.HasColumnType("character varying(200)");
b.Property<bool>("MedjobsEnabled")
.HasColumnType("boolean");
+7
View File
@@ -51,6 +51,13 @@ public class AppSetting
/// <summary>Generic web pages to scrape, one URL per line.</summary>
[MaxLength(4000)] public string? WebsiteUrls { get; set; }
/// <summary>Route ingestion fetches through a proxy (needed in Iran for Telegram etc.).</summary>
public bool IngestProxyEnabled { get; set; } = false;
/// <summary>Local proxy an Xray/V2Ray client sidecar exposes, e.g. socks5://xray:10808
/// (also accepts socks4:// or http://). The app cannot read vmess/vless/trojan directly;
/// the sidecar converts that config into this local proxy.</summary>
[MaxLength(200)] public string? IngestProxyUrl { get; set; }
public bool DivarEnabled { get; set; } = false;
[MaxLength(60)] public string? DivarCity { get; set; } = "tehran";
/// <summary>Divar search terms, one per line or comma-separated.</summary>
@@ -168,6 +168,16 @@
<p class="muted" style="font-size:12px; margin:4px 0 0;">موتور هر آدرس را می‌خواند و متن آگهی را استخراج می‌کند (عنوان og + بدنه محتوا). برای هر صفحه شغلی، آرشیو کانال یا آگهی طبقه‌بندی.</p>
</div>
<div class="filter-group">
<label style="display:flex; align-items:center; gap:8px; font-weight:700;">
<input type="checkbox" name="IngestProxyEnabled" value="true" style="width:auto;" checked="@Model.IngestProxyEnabled" />
ارسال جمع‌آوری از طریق پروکسی (برای دسترسی به تلگرام و … در ایران)
</label>
<label style="margin-top:6px;">آدرس پروکسی محلی</label>
<input type="text" name="IngestProxyUrl" value="@Model.IngestProxyUrl" dir="ltr" placeholder="socks5://xray:10808" />
<p class="muted" style="font-size:12px; margin:4px 0 0;">یک کلاینت Xray/V2Ray (سرویس جانبی) کانفیگ vmess/vless/trojan تو را به یک پروکسی محلی SOCKS تبدیل می‌کند؛ آدرس همان را اینجا بگذار (socks5:// یا socks4:// یا http://).</p>
</div>
<hr style="border:none; border-top:1px solid var(--line); margin:18px 0;" />
<h3 style="margin-top:0;">حالت نمایشی (Demo)</h3>
@@ -55,6 +55,8 @@ public class SettingsModel : PageModel
[BindProperty] public bool DemoMode { get; set; }
[BindProperty] public bool WebsitesEnabled { get; set; }
[BindProperty] public string? WebsiteUrls { get; set; }
[BindProperty] public bool IngestProxyEnabled { get; set; }
[BindProperty] public string? IngestProxyUrl { get; set; }
[TempData] public string? Saved { get; set; }
[TempData] public string? SmsTest { get; set; }
[TempData] public string? DemoMsg { get; set; }
@@ -89,6 +91,8 @@ public class SettingsModel : PageModel
DemoMode = s.DemoMode;
WebsitesEnabled = s.WebsitesEnabled;
WebsiteUrls = s.WebsiteUrls;
IngestProxyEnabled = s.IngestProxyEnabled;
IngestProxyUrl = s.IngestProxyUrl;
WebNotificationsEnabled = s.WebNotificationsEnabled;
PushEnabled = s.PushEnabled;
VapidPublicKey = s.VapidPublicKey;
@@ -127,6 +131,8 @@ public class SettingsModel : PageModel
DemoMode = DemoMode,
WebsitesEnabled = WebsitesEnabled,
WebsiteUrls = WebsiteUrls,
IngestProxyEnabled = IngestProxyEnabled,
IngestProxyUrl = IngestProxyUrl,
WebNotificationsEnabled = WebNotificationsEnabled,
PushEnabled = PushEnabled,
VapidPublicKey = VapidPublicKey,
+2
View File
@@ -37,6 +37,8 @@ builder.Services.AddHttpClient("scrape", c =>
c.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; HamkadrBot/1.0)");
});
builder.Services.AddHttpClient("ai");
// Proxy-aware client provider for ingestion (routes through Xray/V2Ray SOCKS proxy when set).
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.ScrapeHttpClients>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.ListingValidator>();
builder.Services.AddSingleton<JobsMedical.Web.Services.Scraping.IAiAuditor,
JobsMedical.Web.Services.Scraping.OpenAiCompatibleAuditor>();
@@ -10,12 +10,12 @@ namespace JobsMedical.Web.Services.Scraping;
public class BaleListingSource : IListingSource
{
private const string BaseUrl = "https://tapi.bale.ai";
private readonly IHttpClientFactory _http;
private readonly ScrapeHttpClients _clients;
private readonly ILogger<BaleListingSource> _log;
public BaleListingSource(IHttpClientFactory http, ILogger<BaleListingSource> log)
public BaleListingSource(ScrapeHttpClients clients, ILogger<BaleListingSource> log)
{
_http = http;
_clients = clients;
_log = log;
}
@@ -27,7 +27,7 @@ public class BaleListingSource : IListingSource
try
{
var client = _http.CreateClient("scrape");
var client = _clients.For(s);
var body = await client.GetStringAsync($"{BaseUrl}/bot{s.BaleBotToken}/getUpdates", ct);
using var doc = JsonDocument.Parse(body);
if (!doc.RootElement.TryGetProperty("result", out var result) || result.ValueKind != JsonValueKind.Array)
@@ -12,12 +12,12 @@ namespace JobsMedical.Web.Services.Scraping;
public class DivarListingSource : IListingSource
{
private const string BaseUrl = "https://api.divar.ir/v8/web-search";
private readonly IHttpClientFactory _http;
private readonly ScrapeHttpClients _clients;
private readonly ILogger<DivarListingSource> _log;
public DivarListingSource(IHttpClientFactory http, ILogger<DivarListingSource> log)
public DivarListingSource(ScrapeHttpClients clients, ILogger<DivarListingSource> log)
{
_http = http;
_clients = clients;
_log = log;
}
@@ -29,7 +29,7 @@ public class DivarListingSource : IListingSource
if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();
var city = string.IsNullOrWhiteSpace(s.DivarCity) ? "tehran" : s.DivarCity.Trim();
var client = _http.CreateClient("scrape");
var client = _clients.For(s);
var items = new List<ScrapedItem>();
foreach (var q in queries)
{
@@ -13,12 +13,12 @@ namespace JobsMedical.Web.Services.Scraping;
public class MedjobsListingSource : IListingSource
{
private const string SitemapIndex = "https://medjobs.ir/sitemap_index.xml";
private readonly IHttpClientFactory _http;
private readonly ScrapeHttpClients _clients;
private readonly ILogger<MedjobsListingSource> _log;
public MedjobsListingSource(IHttpClientFactory http, ILogger<MedjobsListingSource> log)
public MedjobsListingSource(ScrapeHttpClients clients, ILogger<MedjobsListingSource> log)
{
_http = http;
_clients = clients;
_log = log;
}
@@ -28,7 +28,7 @@ public class MedjobsListingSource : IListingSource
{
if (!s.MedjobsEnabled) return Array.Empty<ScrapedItem>();
var max = Math.Clamp(s.MedjobsMaxAds, 1, 500);
var client = _http.CreateClient("scrape");
var client = _clients.For(s);
try
{
@@ -0,0 +1,55 @@
using System.Collections.Concurrent;
using System.Net;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Supplies the HttpClient used by ingestion sources, optionally routed through a proxy.
///
/// Telegram (t.me) and some other sources are filtered in Iran, so the admin can point
/// ingestion at a local proxy that an Xray/V2Ray client sidecar exposes (e.g.
/// <c>socks5://xray:10808</c>). .NET's WebProxy understands <c>socks5://</c>, <c>socks4://</c>
/// and <c>http://</c> schemes, so the same code path covers all of them.
///
/// Clients are cached per proxy descriptor (singleton). Changing the proxy in admin settings
/// makes the next run pick up a new client; the old one is disposed.
/// NOTE(review): a stale client is disposed as soon as a call with a different proxy URL
/// arrives, so a caller still holding the old client would see it disposed — this assumes
/// ingestion runs with different settings do not overlap; confirm against the scheduler.
/// </summary>
public sealed class ScrapeHttpClients : IDisposable
{
    /// <summary>Cache key of the un-proxied client.</summary>
    private const string DirectKey = "direct";

    // Values are Lazy-wrapped because ConcurrentDictionary.GetOrAdd may invoke its value
    // factory more than once when several sources race on a missing key. Wrapping the build
    // in a Lazy guarantees only one HttpClient (and handler) is ever created per key, so no
    // losing instance is left undisposed.
    private readonly ConcurrentDictionary<string, Lazy<HttpClient>> _cache = new();

    /// <summary>The HttpClient for the given settings — proxied when enabled, direct otherwise.</summary>
    /// <param name="s">Current settings; the proxy is used only when
    /// <c>IngestProxyEnabled</c> is set and <c>IngestProxyUrl</c> is non-blank.</param>
    /// <returns>A shared, cached client. Callers must not dispose it.</returns>
    /// <exception cref="UriFormatException">The configured proxy URL cannot be parsed.</exception>
    public HttpClient For(AppSetting s)
    {
        var key = (s.IngestProxyEnabled && !string.IsNullOrWhiteSpace(s.IngestProxyUrl))
            ? s.IngestProxyUrl.Trim()
            : DirectKey;

        // Drop stale clients if the proxy URL changed (keep only "direct" + the current proxy).
        foreach (var k in _cache.Keys)
        {
            if (k != DirectKey && k != key && _cache.TryRemove(k, out var stale))
            {
                DisposeIfBuilt(stale);
            }
        }

        // The Lazy itself is cheap to create (and discard on a lost race); the client is
        // only built when .Value is read on the single instance that made it into the cache.
        return _cache.GetOrAdd(key, k => new Lazy<HttpClient>(() => Build(k))).Value;
    }

    /// <summary>Builds a new client for <paramref name="key"/>: direct for "direct",
    /// otherwise routed through the proxy at that address.</summary>
    private static HttpClient Build(string key)
    {
        // Parse the proxy address BEFORE allocating the handler: WebProxy throws on a
        // malformed address, and a handler created first would never be disposed.
        var proxy = key != DirectKey
            ? new WebProxy(key) // socks5:// | socks4:// | http://
            : null;

        var handler = new HttpClientHandler { AutomaticDecompression = DecompressionMethods.All };
        if (proxy is not null)
        {
            handler.Proxy = proxy;
            handler.UseProxy = true;
        }

        var c = new HttpClient(handler) { Timeout = TimeSpan.FromSeconds(20) };
        c.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; HamkadrBot/1.0)");
        return c;
    }

    /// <summary>Disposes every client that was actually built and empties the cache.</summary>
    public void Dispose()
    {
        foreach (var entry in _cache.Values)
        {
            DisposeIfBuilt(entry);
        }

        _cache.Clear();
    }

    /// <summary>Disposes the wrapped client only if it was created. Reading <c>Value</c> on
    /// an unbuilt entry would build a client just to dispose it, and on a faulted entry it
    /// would rethrow the cached build exception.</summary>
    private static void DisposeIfBuilt(Lazy<HttpClient> entry)
    {
        if (entry.IsValueCreated)
        {
            entry.Value.Dispose();
        }
    }
}
@@ -44,6 +44,8 @@ public class SettingsService
s.DemoMode = incoming.DemoMode;
s.WebsitesEnabled = incoming.WebsitesEnabled;
s.WebsiteUrls = incoming.WebsiteUrls?.Trim();
s.IngestProxyEnabled = incoming.IngestProxyEnabled;
s.IngestProxyUrl = incoming.IngestProxyUrl?.Trim();
s.DivarEnabled = incoming.DivarEnabled;
s.DivarCity = string.IsNullOrWhiteSpace(incoming.DivarCity) ? "tehran" : incoming.DivarCity.Trim();
s.DivarQueries = incoming.DivarQueries?.Trim();
@@ -10,12 +10,12 @@ namespace JobsMedical.Web.Services.Scraping;
/// </summary>
public class TelegramListingSource : IListingSource
{
private readonly IHttpClientFactory _http;
private readonly ScrapeHttpClients _clients;
private readonly ILogger<TelegramListingSource> _log;
public TelegramListingSource(IHttpClientFactory http, ILogger<TelegramListingSource> log)
public TelegramListingSource(ScrapeHttpClients clients, ILogger<TelegramListingSource> log)
{
_http = http;
_clients = clients;
_log = log;
}
@@ -26,7 +26,7 @@ public class TelegramListingSource : IListingSource
var channels = AppSetting.SplitList(s.TelegramChannels);
if (!s.TelegramEnabled || channels.Count == 0) return Array.Empty<ScrapedItem>();
var client = _http.CreateClient("scrape");
var client = _clients.For(s);
var items = new List<ScrapedItem>();
foreach (var ch in channels.Select(c => c.TrimStart('@')).Where(c => c.Length > 0))
{
@@ -11,12 +11,12 @@ namespace JobsMedical.Web.Services.Scraping;
/// </summary>
public class WebsiteListingSource : IListingSource
{
private readonly IHttpClientFactory _http;
private readonly ScrapeHttpClients _clients;
private readonly ILogger<WebsiteListingSource> _log;
public WebsiteListingSource(IHttpClientFactory http, ILogger<WebsiteListingSource> log)
public WebsiteListingSource(ScrapeHttpClients clients, ILogger<WebsiteListingSource> log)
{
_http = http;
_clients = clients;
_log = log;
}
@@ -27,7 +27,7 @@ public class WebsiteListingSource : IListingSource
var urls = AppSetting.SplitList(s.WebsiteUrls);
if (!s.WebsitesEnabled || urls.Count == 0) return Array.Empty<ScrapedItem>();
var client = _http.CreateClient("scrape");
var client = _clients.For(s);
var items = new List<ScrapedItem>();
foreach (var url in urls.Where(u => u.StartsWith("http")))
{