Move ingestion + Telegram/Bale/Divar config to DB-backed admin settings
CI/CD / CI · dotnet build (push) Successful in 6m22s
CI/CD / Deploy · hamkadr (push) Failing after 3s

- AppSetting gains source config: AutoIngestEnabled, IngestIntervalMinutes, Telegram/Bale/Divar enabled+channels/token/queries
- IListingSource.FetchAsync(AppSetting) — sources read config from DB, not IOptions/appsettings; sample source dev-only
- IngestionWorker reads AutoIngest+interval from DB each cycle (toggle at runtime, no redeploy)
- /Admin/Settings gets a 'منابع جمع‌آوری' section; removed Ingestion env/appsettings + compose env vars
- ENV_FILE shrinks to HOST_PORT + POSTGRES_* + ADMIN_PHONE (AI + sources are all in-admin); migration

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-04 00:44:11 +03:30
parent 6cfdd16c42
commit 3c08c1a265
20 changed files with 1217 additions and 167 deletions
@@ -1,46 +1,34 @@
using System.Text.Json;
using Microsoft.Extensions.Options;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Configuration values for the Bale source: an on/off flag, the bot token, and the API base URL.
/// </summary>
public class BaleOptions
{
/// <summary>Whether the Bale source is switched on. A token is also needed before it will fetch.</summary>
public bool Enabled { get; set; }
/// <summary>Bot token placed in the request path as <c>/bot{token}/getUpdates</c>; null when unset.</summary>
public string? BotToken { get; set; }
/// <summary>Base address the getUpdates path is appended to (a trailing slash is trimmed by the caller).</summary>
public string BaseUrl { get; set; } = "https://tapi.bale.ai"; // Bale Bot API host
}
/// <summary>
/// Bale (Iranian messenger) source via its Telegram-compatible Bot API getUpdates. The bot must
/// be a member/admin of the channels it should read. Pulls text from messages and channel posts.
/// Bale (Iranian messenger) source via its Telegram-compatible Bot API getUpdates. Enabled +
/// bot token come from admin settings (DB). The bot must be a member of the channels it reads.
/// </summary>
public class BaleListingSource : IListingSource
{
private readonly BaleOptions _opts;
private const string BaseUrl = "https://tapi.bale.ai";
private readonly IHttpClientFactory _http;
private readonly ILogger<BaleListingSource> _log;
public BaleListingSource(IOptions<BaleOptions> opts, IHttpClientFactory http,
ILogger<BaleListingSource> log)
public BaleListingSource(IHttpClientFactory http, ILogger<BaleListingSource> log)
{
_opts = opts.Value;
_http = http;
_log = log;
}
public string Name => "بله";
public bool Enabled => _opts.Enabled && !string.IsNullOrWhiteSpace(_opts.BotToken);
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{
if (!Enabled) { _log.LogInformation("Bale source disabled/unconfigured."); return Array.Empty<ScrapedItem>(); }
if (!s.BaleEnabled || string.IsNullOrWhiteSpace(s.BaleBotToken)) return Array.Empty<ScrapedItem>();
try
{
var client = _http.CreateClient("scrape");
var url = $"{_opts.BaseUrl.TrimEnd('/')}/bot{_opts.BotToken}/getUpdates";
var body = await client.GetStringAsync(url, ct);
var body = await client.GetStringAsync($"{BaseUrl}/bot{s.BaleBotToken}/getUpdates", ct);
using var doc = JsonDocument.Parse(body);
if (!doc.RootElement.TryGetProperty("result", out var result) || result.ValueKind != JsonValueKind.Array)
return Array.Empty<ScrapedItem>();
@@ -54,11 +42,7 @@ public class BaleListingSource : IListingSource
}
return items;
}
catch (Exception ex)
{
_log.LogWarning(ex, "Bale fetch failed.");
return Array.Empty<ScrapedItem>();
}
catch (Exception ex) { _log.LogWarning(ex, "Bale fetch failed."); return Array.Empty<ScrapedItem>(); }
}
private static string? TextOf(JsonElement update, string key)
@@ -1,55 +1,44 @@
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Options;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Configuration values for the Divar source. The request URL is built as
/// <c>{BaseUrl}/{City}/{Category}?q={query}</c>, once per entry in <see cref="Queries"/>.
/// </summary>
public class DivarOptions
{
/// <summary>Whether the Divar source is switched on. At least one query is also needed before it will fetch.</summary>
public bool Enabled { get; set; }
/// <summary>City path segment of the search URL.</summary>
public string City { get; set; } = "tehran";
/// <summary>Category path segment of the search URL.</summary>
public string Category { get; set; } = "jobs";
/// <summary>Search terms; each one is URL-escaped and sent as the <c>q</c> parameter. Blank entries are skipped by the caller.</summary>
public List<string> Queries { get; set; } = new(); // e.g. "پرستار" (nurse), "پزشک عمومی" (general practitioner), "درمانگاه" (clinic)
/// <summary>Base address of the Divar web-search endpoint.</summary>
public string BaseUrl { get; set; } = "https://api.divar.ir/v8/web-search";
/// <summary>Maximum number of harvested texts kept per query.</summary>
public int PerQuery { get; set; } = 25;
}
/// <summary>
/// Best-effort Divar fetch: queries Divar's web-search JSON for each term and harvests post
/// titles + descriptions. Divar's private API shifts shape over time, so we walk the JSON
/// tolerantly for any object carrying a "title" plus a nearby description field, and fail soft.
/// titles + descriptions. Enabled + city + queries come from admin settings (DB). Divar's
/// private API shifts shape, so we walk JSON tolerantly and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
private readonly DivarOptions _opts;
private const string BaseUrl = "https://api.divar.ir/v8/web-search";
private readonly IHttpClientFactory _http;
private readonly ILogger<DivarListingSource> _log;
public DivarListingSource(IOptions<DivarOptions> opts, IHttpClientFactory http,
ILogger<DivarListingSource> log)
public DivarListingSource(IHttpClientFactory http, ILogger<DivarListingSource> log)
{
_opts = opts.Value;
_http = http;
_log = log;
}
public string Name => "دیوار";
public bool Enabled => _opts.Enabled && _opts.Queries.Count > 0;
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{
if (!Enabled) { _log.LogInformation("Divar source disabled/unconfigured."); return Array.Empty<ScrapedItem>(); }
var queries = AppSetting.SplitList(s.DivarQueries);
if (!s.DivarEnabled || queries.Count == 0) return Array.Empty<ScrapedItem>();
var city = string.IsNullOrWhiteSpace(s.DivarCity) ? "tehran" : s.DivarCity.Trim();
var client = _http.CreateClient("scrape");
var items = new List<ScrapedItem>();
foreach (var q in _opts.Queries.Where(q => q.Trim().Length > 0))
foreach (var q in queries)
{
try
{
var url = $"{_opts.BaseUrl.TrimEnd('/')}/{_opts.City}/{_opts.Category}?q={Uri.EscapeDataString(q)}";
var url = $"{BaseUrl}/{city}/jobs?q={Uri.EscapeDataString(q)}";
var body = await client.GetStringAsync(url, ct);
using var doc = JsonDocument.Parse(body);
foreach (var text in Harvest(doc.RootElement).Take(_opts.PerQuery))
foreach (var text in Harvest(doc.RootElement).Take(25))
items.Add(new ScrapedItem("دیوار", text, "https://divar.ir"));
}
catch (Exception ex) { _log.LogWarning(ex, "Divar fetch failed for query {Query}", q); }
@@ -60,7 +49,6 @@ public class DivarListingSource : IListingSource
private static readonly string[] DescKeys =
{ "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };
/// <summary>Walk the JSON; for each object with a string "title", emit title + first description.</summary>
private static IEnumerable<string> Harvest(JsonElement el)
{
if (el.ValueKind == JsonValueKind.Object)
@@ -75,12 +63,12 @@ public class DivarListingSource : IListingSource
if (text.Length >= 15) yield return text;
}
foreach (var p in el.EnumerateObject())
foreach (var s in Harvest(p.Value)) yield return s;
foreach (var x in Harvest(p.Value)) yield return x;
}
else if (el.ValueKind == JsonValueKind.Array)
{
foreach (var item in el.EnumerateArray())
foreach (var s in Harvest(item)) yield return s;
foreach (var x in Harvest(item)) yield return x;
}
}
}
@@ -1,15 +1,17 @@
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).</summary>
/// <param name="Source">Label of the originating source — a source name, or "source/channel" for per-channel sources.</param>
/// <param name="RawText">The post's text as harvested from the source.</param>
/// <param name="SourceUrl">Optional link back to where the post came from; null when none is supplied.</param>
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null);
/// <summary>
/// A pluggable source the ingestion engine pulls from. Implement once per channel/site.
/// `Enabled` lets a source be present but dormant until it's configured with credentials.
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
/// comes from the DB-backed <see cref="AppSetting"/> passed in — set in the admin panel, not env.
/// A disabled/unconfigured source returns an empty list.
/// </summary>
public interface IListingSource
{
// Human-readable source name; the ingestion engine uses it when logging a failed source.
string Name { get; }
// Lets a source be present but dormant until it is configured with credentials.
bool Enabled { get; }
// Fetches the source's current posts using the source's own configuration.
Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default);
// Fetches the source's current posts using the DB-backed settings passed in.
// A disabled or unconfigured source returns an empty list rather than throwing.
// NOTE(review): both FetchAsync overloads appear together in this view — confirm which one callers should use.
Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting settings, CancellationToken ct = default);
}
@@ -43,8 +43,7 @@ public class IngestionService
_ai = ai; _settings = settings; _log = log;
}
public IReadOnlyList<(string Name, bool Enabled)> Sources =>
_sources.Select(s => (s.Name, s.Enabled)).ToList();
public IReadOnlyList<string> SourceNames => _sources.Select(s => s.Name).ToList();
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
@@ -58,12 +57,13 @@ public class IngestionService
var results = new List<SourceResult>();
foreach (var source in _sources.Where(s => s.Enabled))
foreach (var source in _sources)
{
int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
IReadOnlyList<ScrapedItem> items;
try { items = await source.FetchAsync(ct); }
try { items = await source.FetchAsync(settings, ct); }
catch (Exception ex) { _log.LogError(ex, "Source {Source} failed", source.Name); continue; }
if (items.Count == 0) continue; // disabled/unconfigured source
foreach (var item in items)
{
@@ -1,58 +1,52 @@
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Configuration values for the scheduled ingestion worker: whether it runs at all and how often.
/// </summary>
public class IngestionOptions
{
/// <summary>Whether the worker runs. When false the worker logs that it is disabled and returns without looping.</summary>
public bool Enabled { get; set; } = false; // off by default — opt in via config
/// <summary>Minutes between runs. The worker clamps this to a minimum of 1 when building its delay.</summary>
public int IntervalMinutes { get; set; } = 30;
}
/// <summary>
/// Periodically runs the ingestion engine when enabled (Ingestion:Enabled=true). Off by default
/// so nothing scrapes uninvited; admins can also trigger a run on demand from the admin UI.
/// Periodically runs the ingestion engine when the admin has turned auto-ingest ON
/// (AppSetting.AutoIngestEnabled) — read fresh from the DB each cycle, so it can be toggled at
/// runtime from the admin panel with no redeploy. When off, it idles and re-checks.
/// </summary>
public class IngestionWorker : BackgroundService
{
private readonly IServiceScopeFactory _scopes;
private readonly IngestionOptions _opts;
private readonly ILogger<IngestionWorker> _log;
public IngestionWorker(IServiceScopeFactory scopes, IOptions<IngestionOptions> opts,
ILogger<IngestionWorker> log)
public IngestionWorker(IServiceScopeFactory scopes, ILogger<IngestionWorker> log)
{
_scopes = scopes;
_opts = opts.Value;
_log = log;
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
if (!_opts.Enabled)
{
_log.LogInformation("Ingestion worker disabled (Ingestion:Enabled=false).");
return;
}
var interval = TimeSpan.FromMinutes(Math.Max(1, _opts.IntervalMinutes));
_log.LogInformation("Ingestion worker on; every {Min} min.", _opts.IntervalMinutes);
// Small startup delay so the DB/migrations are ready.
try { await Task.Delay(TimeSpan.FromSeconds(20), stoppingToken); }
catch (OperationCanceledException) { return; }
while (!stoppingToken.IsCancellationRequested)
{
var idleMinutes = 10;
try
{
using var scope = _scopes.CreateScope();
var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();
var summary = await svc.RunAsync(stoppingToken);
_log.LogInformation("Scheduled ingestion: queued={Q} flagged={F} spam={S} dupes={D}",
summary.TotalQueued, summary.TotalFlagged, summary.TotalSpam, summary.TotalDuplicates);
var settings = await scope.ServiceProvider
.GetRequiredService<SettingsService>().GetAsync();
if (settings.AutoIngestEnabled)
{
var svc = scope.ServiceProvider.GetRequiredService<IngestionService>();
var summary = await svc.RunAsync(stoppingToken);
_log.LogInformation("Auto-ingest: queued={Q} published={P} flagged={F} spam={S} dupes={D}",
summary.TotalQueued, summary.TotalPublished, summary.TotalFlagged,
summary.TotalSpam, summary.TotalDuplicates);
idleMinutes = Math.Max(1, settings.IngestIntervalMinutes);
}
}
catch (Exception ex) when (ex is not OperationCanceledException)
{
_log.LogError(ex, "Scheduled ingestion run failed");
_log.LogError(ex, "Auto-ingest cycle failed");
}
try { await Task.Delay(interval, stoppingToken); }
try { await Task.Delay(TimeSpan.FromMinutes(idleMinutes), stoppingToken); }
catch (OperationCanceledException) { break; }
}
}
@@ -1,27 +1,33 @@
using JobsMedical.Web.Models;
using Microsoft.Extensions.Hosting;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// A built-in source of representative Persian posts (the kind found in shift channels). Always
/// available, needs no credentials — it lets the whole ingestion → validation → review pipeline
/// run and be demoed today, and doubles as a fixture mix of good, incomplete, and spam posts.
/// Built-in representative Persian posts (good, incomplete, and spam) so the whole pipeline can be
/// demoed. Only active in Development — never injects sample data into production.
/// </summary>
public class SampleListingSource : IListingSource
{
private readonly IHostEnvironment _env;
public SampleListingSource(IHostEnvironment env) => _env = env;
public string Name => "نمونه (کانال آزمایشی)";
public bool Enabled => true;
private static readonly string[] Posts =
{
"درمانگاه شبانه‌روزی در سعادت‌آباد نیازمند پزشک عمومی برای شیفت شب، کارانه ۳ میلیون تومان. تماس ۰۹۱۲۳۴۵۶۷۸۹",
"کلینیک تخصصی در تهران به پرستار برای شیفت عصر نیازمند است، ۵۰٪ سهم درآمد. ۰۹۳۵۱۱۱۲۲۳۳",
"کلینیک تخصصی در تهران به پرستار خانم برای شیفت عصر نیازمند است، ۵۰٪ سهم درآمد. ۰۹۳۵۱۱۱۲۲۳۳",
"استخدام ماما تمام‌وقت در بیمارستان خصوصی، حقوق توافقی. منطقه شهرک غرب.",
"نیازمند تکنسین اتاق عمل جهت همکاری در نارمک، شیفت صبح. ۰۹۱۲۰۰۰۰۰۰۰",
"فروش فالوور و بک لینک ارزان، سرمایه گذاری در ارز دیجیتال با سود تضمینی!", // spam
"پزشک", // too short / incomplete
"نیازمند تکنسین اتاق عمل آقا جهت همکاری در نارمک، شیفت صبح. ۰۹۱۲۰۰۰۰۰۰۰",
"فروش فالوور و بک لینک ارزان، سرمایه گذاری در ارز دیجیتال با سود تضمینی!",
"پزشک",
"بیمارستان آتیه جهت تکمیل کادر درمان به پزشک عمومی مقیم نیازمند است. قرارداد یک‌ساله، حقوق ۴۵ میلیون ماهانه. تهرانپارس.",
};
public Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
public Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting settings, CancellationToken ct = default)
=> Task.FromResult<IReadOnlyList<ScrapedItem>>(
Posts.Select(p => new ScrapedItem(Name, p)).ToList());
_env.IsDevelopment()
? Posts.Select(p => new ScrapedItem(Name, p)).ToList()
: Array.Empty<ScrapedItem>());
}
@@ -34,6 +34,16 @@ public class SettingsService
s.AiSystemPrompt = string.IsNullOrWhiteSpace(incoming.AiSystemPrompt)
? AppSetting.DefaultPrompt : incoming.AiSystemPrompt;
s.AiAutoApprove = incoming.AiAutoApprove;
// Channel scraping sources
s.AutoIngestEnabled = incoming.AutoIngestEnabled;
s.IngestIntervalMinutes = Math.Max(1, incoming.IngestIntervalMinutes);
s.TelegramEnabled = incoming.TelegramEnabled;
s.TelegramChannels = incoming.TelegramChannels?.Trim();
s.BaleEnabled = incoming.BaleEnabled;
s.BaleBotToken = incoming.BaleBotToken?.Trim();
s.DivarEnabled = incoming.DivarEnabled;
s.DivarCity = string.IsNullOrWhiteSpace(incoming.DivarCity) ? "tehran" : incoming.DivarCity.Trim();
s.DivarQueries = incoming.DivarQueries?.Trim();
s.UpdatedAt = DateTime.UtcNow;
await _db.SaveChangesAsync();
}
@@ -1,50 +1,39 @@
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Configuration values for the Telegram source, which reads each channel's public web preview
/// at <c>https://t.me/s/{channel}</c>.
/// </summary>
public class TelegramOptions
{
/// <summary>Whether the Telegram source is switched on. At least one channel is also needed before it will fetch.</summary>
public bool Enabled { get; set; }
/// <summary>Bot token; not read by the fetch code visible here.</summary>
public string? BotToken { get; set; } // optional (for private channels later)
/// <summary>Channel usernames to read. A leading '@' is stripped by the caller and empty entries are skipped.</summary>
public List<string> Channels { get; set; } = new(); // public channel usernames (no @)
/// <summary>Maximum number of extracted messages kept per channel.</summary>
public int PerChannel { get; set; } = 20;
}
/// <summary>
/// Reads public Telegram channels via the web preview (https://t.me/s/&lt;channel&gt;) — no bot
/// token or login needed for public channels. Each message's text becomes a ScrapedItem.
/// token needed for public channels. Enabled + channel list come from the admin settings (DB).
/// </summary>
public class TelegramListingSource : IListingSource
{
private readonly TelegramOptions _opts;
private readonly IHttpClientFactory _http;
private readonly ILogger<TelegramListingSource> _log;
public TelegramListingSource(IOptions<TelegramOptions> opts, IHttpClientFactory http,
ILogger<TelegramListingSource> log)
public TelegramListingSource(IHttpClientFactory http, ILogger<TelegramListingSource> log)
{
_opts = opts.Value;
_http = http;
_log = log;
}
public string Name => "تلگرام";
public bool Enabled => _opts.Enabled && _opts.Channels.Count > 0;
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(AppSetting s, CancellationToken ct = default)
{
if (!Enabled) { _log.LogInformation("Telegram source disabled/unconfigured."); return Array.Empty<ScrapedItem>(); }
var channels = AppSetting.SplitList(s.TelegramChannels);
if (!s.TelegramEnabled || channels.Count == 0) return Array.Empty<ScrapedItem>();
var client = _http.CreateClient("scrape");
var items = new List<ScrapedItem>();
foreach (var ch in _opts.Channels.Select(c => c.TrimStart('@')).Where(c => c.Length > 0))
foreach (var ch in channels.Select(c => c.TrimStart('@')).Where(c => c.Length > 0))
{
try
{
var html = await client.GetStringAsync($"https://t.me/s/{ch}", ct);
foreach (var text in ExtractMessages(html).Take(_opts.PerChannel))
foreach (var text in ExtractMessages(html).Take(20))
items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}"));
}
catch (Exception ex) { _log.LogWarning(ex, "Telegram fetch failed for {Channel}", ch); }
@@ -52,7 +41,6 @@ public class TelegramListingSource : IListingSource
return items;
}
// Message bodies live in <div class="tgme_widget_message_text ...">...</div>.
private static IEnumerable<string> ExtractMessages(string html)
{
foreach (Match m in Regex.Matches(html,
@@ -69,7 +57,7 @@ internal static class HtmlUtil
public static string ToPlainText(string html)
{
var s = Regex.Replace(html, "<br\\s*/?>", "\n", RegexOptions.IgnoreCase);
s = Regex.Replace(s, "<[^>]+>", ""); // strip remaining tags
s = Regex.Replace(s, "<[^>]+>", "");
s = WebUtility.HtmlDecode(s);
s = Regex.Replace(s, "[ \\t]+", " ");
return s.Trim();