Real channel fetch (Telegram/Bale/Divar) + AI-audited automation engine + CI/CD

- Fetch: Telegram via t.me/s, Bale via Bot API, Divar via web-search (HttpClient, config-gated, graceful)
- AI layer: DB-backed AppSetting (mode auto/manual, thresholds, AI endpoint/model/key/prompt/framework, auto-approve); OpenAI-compatible IAiAuditor (self-host/Iranian endpoints; fails safe to manual)
- Pipeline: fetch → dedupe(hash) → parse → validate → AI audit → Discard/Flag/Queue/auto-publish (resolve-or-create facility)
- Admin: /Admin/Settings automation+AI panel; queue shows confidence + AI verdict; flagged section
- CI/CD: Dockerfile, docker-compose.prod.yml, .gitea/workflows/ci-cd.yml, nginx vhost, DEPLOY.md; forwarded headers + /healthz + prod reference-only seed; ports 22/80/443 only

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-03 17:41:02 +03:30
parent 931b7b6ffb
commit 36bb165438
18 changed files with 1614 additions and 68 deletions
@@ -0,0 +1,108 @@
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using JobsMedical.Web.Models;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>
/// Structured fields the AI extracted from a raw listing. Every field is nullable; the verdict
/// parser leaves a field null when the model's JSON does not contain it.
/// </summary>
/// <param name="Kind">Listing kind as free text from the model.</param>
/// <param name="Role">Role name as reported by the model.</param>
/// <param name="City">City name as reported by the model.</param>
/// <param name="District">District name as reported by the model.</param>
/// <param name="ShiftType">Shift type keyword as reported by the model.</param>
/// <param name="EmploymentType">Employment type keyword as reported by the model.</param>
/// <param name="PayAmount">Pay amount, read from the "payAmount" JSON field.</param>
/// <param name="SharePercent">Share percentage, read from the "sharePercent" JSON field.</param>
/// <param name="Title">Listing title suggested by the model.</param>
/// <param name="FacilityName">Facility name as reported by the model.</param>
public record AiStructured(
string? Kind, string? Role, string? City, string? District, string? ShiftType,
string? EmploymentType, long? PayAmount, int? SharePercent, string? Title, string? FacilityName);
/// <summary>
/// The AI's verdict on one raw listing: a decision keyword, a confidence score, an optional
/// free-text reason, and whatever structured fields the model extracted.
/// </summary>
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
{
    /// <summary>True when <see cref="Decision"/> is "approve", ignoring case.</summary>
    public bool Approve
    {
        get { return HasDecision("approve"); }
    }

    /// <summary>True when <see cref="Decision"/> is "reject", ignoring case.</summary>
    public bool Reject
    {
        get { return HasDecision("reject"); }
    }

    // Ordinal, case-insensitive comparison of the decision keyword.
    private bool HasDecision(string keyword)
    {
        return Decision.Equals(keyword, StringComparison.OrdinalIgnoreCase);
    }
}
/// <summary>Contract for an AI component that audits raw listing text and returns a verdict.</summary>
public interface IAiAuditor
{
/// <summary>Audit a raw post. Returns null when AI is off or the call fails (fail safe → manual).</summary>
/// <param name="rawText">The raw listing text to audit.</param>
/// <param name="settings">Platform settings carrying the AI configuration.</param>
/// <param name="ct">Token used to cancel the audit.</param>
/// <returns>The verdict, or null when no AI verdict is available.</returns>
Task<AiAuditResult?> AuditAsync(string rawText, AppSetting settings, CancellationToken ct = default);
}
/// <summary>
/// Calls any OpenAI-compatible chat-completions endpoint (self-hosted vLLM/Ollama, or an Iranian
/// provider — OpenAI/Anthropic are blocked from Iran). The admin-set system prompt is the
/// "framework" that tells the model how to approve/reject/structure. We ask for strict JSON and
/// parse it. Any failure returns null so ingestion falls back to the rule-based path.
/// </summary>
public class OpenAiCompatibleAuditor : IAiAuditor
{
    private readonly IHttpClientFactory _http;
    private readonly ILogger<OpenAiCompatibleAuditor> _log;

    /// <summary>Creates the auditor.</summary>
    /// <param name="http">Factory used to obtain the named "ai" HTTP client.</param>
    /// <param name="log">Logger for audit failures.</param>
    public OpenAiCompatibleAuditor(IHttpClientFactory http, ILogger<OpenAiCompatibleAuditor> log)
    {
        _http = http;
        _log = log;
    }

    /// <summary>
    /// Sends <paramref name="rawText"/> to the configured chat-completions endpoint and parses the
    /// model's JSON reply into a verdict.
    /// </summary>
    /// <param name="rawText">The raw listing text to audit.</param>
    /// <param name="s">Settings providing endpoint, model, API key and system prompt.</param>
    /// <param name="ct">Token used to cancel the HTTP call.</param>
    /// <returns>
    /// The parsed verdict, or null when AI is disabled, no endpoint is configured, the reply is
    /// empty/unparseable, or the call fails (including the 30-second timeout).
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="ct"/> itself was cancelled; caller cancellation is not reported
    /// as an audit failure.
    /// </exception>
    public async Task<AiAuditResult?> AuditAsync(string rawText, AppSetting s, CancellationToken ct = default)
    {
        if (!s.AiEnabled || string.IsNullOrWhiteSpace(s.AiEndpoint)) return null;

        try
        {
            var payload = new
            {
                model = string.IsNullOrWhiteSpace(s.AiModel) ? "gpt-4o-mini" : s.AiModel,
                temperature = 0,
                response_format = new { type = "json_object" },
                messages = new object[]
                {
                    new { role = "system", content = s.AiSystemPrompt },
                    new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
                },
            };

            var client = _http.CreateClient("ai");
            client.Timeout = TimeSpan.FromSeconds(30);

            using var req = new HttpRequestMessage(HttpMethod.Post, s.AiEndpoint)
            {
                Content = new StringContent(JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json"),
            };
            if (!string.IsNullOrWhiteSpace(s.AiApiKey))
                req.Headers.Authorization = new AuthenticationHeaderValue("Bearer", s.AiApiKey);

            using var resp = await client.SendAsync(req, ct);
            resp.EnsureSuccessStatusCode();

            var body = await resp.Content.ReadAsStringAsync(ct);
            using var doc = JsonDocument.Parse(body);
            var content = doc.RootElement
                .GetProperty("choices")[0].GetProperty("message").GetProperty("content").GetString();

            return string.IsNullOrWhiteSpace(content) ? null : ParseVerdict(content);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller asked to stop. An HttpClient timeout also surfaces as a cancellation, but
            // with ct not cancelled, so it still takes the fail-safe path below.
            throw;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "AI audit failed — falling back to rule-based decision.");
            return null;
        }
    }

    /// <summary>
    /// Parses the model's message content into a verdict. Tolerates surrounding code fences and
    /// text around the outermost JSON object. Fields that are absent, null, or of an unexpected
    /// type are treated as missing, so one odd field does not discard the whole verdict.
    /// </summary>
    /// <param name="json">The raw message content returned by the model.</param>
    /// <returns>The verdict, or null when no JSON object can be located in the content.</returns>
    private static AiAuditResult? ParseVerdict(string json)
    {
        // The content itself should be a JSON object; tolerate code fences.
        json = json.Trim().Trim('`');
        var start = json.IndexOf('{');
        var end = json.LastIndexOf('}');
        if (start < 0 || end <= start) return null;

        using var doc = JsonDocument.Parse(json.Substring(start, end - start + 1));
        var r = doc.RootElement;
        if (r.ValueKind != JsonValueKind.Object) return null;

        string? S(string k) =>
            r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.String ? v.GetString() : null;

        // JsonElement.TryGetInt64 throws on non-Number kinds (e.g. null or a string), so the kind
        // is checked first. Integers sent as strings ("85") are accepted as well.
        long? L(string k)
        {
            if (!r.TryGetProperty(k, out var v)) return null;
            if (v.ValueKind == JsonValueKind.Number)
                return v.TryGetInt64(out var number) ? number : null;
            if (v.ValueKind == JsonValueKind.String
                && long.TryParse(v.GetString(), System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture, out var parsed))
                return parsed;
            return null;
        }

        int? NI(string k) =>
            L(k) is long n && n >= int.MinValue && n <= int.MaxValue ? (int)n : null;

        var decision = (S("decision") ?? "review").Trim().ToLowerInvariant();
        var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
            S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"));

        // A missing or unusable confidence defaults to 50; the result is clamped to 0..100.
        return new AiAuditResult(decision, Math.Clamp(NI("confidence") ?? 50, 0, 100), S("reason"), data);
    }
}
@@ -0,0 +1,68 @@
using System.Text.Json;
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Configuration for the Bale listing source.</summary>
public class BaleOptions
{
// Master switch; the source is only considered enabled when a bot token is also set.
public bool Enabled { get; set; }
// Bot token inserted into the request path (".../bot{BotToken}/getUpdates").
public string? BotToken { get; set; }
public string BaseUrl { get; set; } = "https://tapi.bale.ai"; // Bale Bot API host
}
/// <summary>
/// Bale (Iranian messenger) source via its Telegram-compatible Bot API getUpdates. The bot must
/// be a member/admin of the channels it should read. Pulls text from messages and channel posts.
/// </summary>
public class BaleListingSource : IListingSource
{
    // Posts shorter than this (after trimming) are ignored.
    private const int MinTextLength = 15;

    private readonly BaleOptions _opts;
    private readonly IHttpClientFactory _http;
    private readonly ILogger<BaleListingSource> _log;

    /// <summary>Creates the source.</summary>
    /// <param name="opts">Bale configuration (enabled flag, bot token, API host).</param>
    /// <param name="http">Factory used to obtain the named "scrape" HTTP client.</param>
    /// <param name="log">Logger for fetch diagnostics.</param>
    public BaleListingSource(IOptions<BaleOptions> opts, IHttpClientFactory http,
        ILogger<BaleListingSource> log)
    {
        _opts = opts.Value;
        _http = http;
        _log = log;
    }

    /// <summary>Source label attached to every scraped item.</summary>
    public string Name => "بله";

    /// <summary>True when the source is switched on and a bot token is configured.</summary>
    public bool Enabled => _opts.Enabled && !string.IsNullOrWhiteSpace(_opts.BotToken);

    /// <summary>
    /// Calls getUpdates once and returns the text of every channel post or message that is at
    /// least 15 characters long after trimming.
    /// </summary>
    /// <param name="ct">Token used to cancel the HTTP call.</param>
    /// <returns>The scraped items; empty when disabled, on failure, or when nothing usable is returned.</returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (!Enabled)
        {
            _log.LogInformation("Bale source disabled/unconfigured.");
            return Array.Empty<ScrapedItem>();
        }

        try
        {
            var client = _http.CreateClient("scrape");
            // NOTE(review): no "offset" is sent, so each call presumably re-reads the same pending
            // updates. Confirm against the Bale Bot API whether updates must be acknowledged.
            var url = $"{_opts.BaseUrl.TrimEnd('/')}/bot{_opts.BotToken}/getUpdates";
            var body = await client.GetStringAsync(url, ct);

            using var doc = JsonDocument.Parse(body);
            var root = doc.RootElement;
            if (root.ValueKind != JsonValueKind.Object
                || !root.TryGetProperty("result", out var result)
                || result.ValueKind != JsonValueKind.Array)
            {
                _log.LogWarning("Bale getUpdates response contained no result array.");
                return Array.Empty<ScrapedItem>();
            }

            var items = new List<ScrapedItem>();
            foreach (var update in result.EnumerateArray())
            {
                var text = (TextOf(update, "channel_post") ?? TextOf(update, "message"))?.Trim();
                if (text is not null && text.Length >= MinTextLength)
                    items.Add(new ScrapedItem(Name, text));
            }
            return items;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "Bale fetch failed.");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// Returns the string "text" of <c>update[key]</c>, or null when any part of that path is
    /// missing or not of the expected JSON kind. The kind checks matter because
    /// <see cref="JsonElement.TryGetProperty(string, out JsonElement)"/> throws on non-objects, and
    /// one malformed update must not discard the whole batch.
    /// </summary>
    private static string? TextOf(JsonElement update, string key)
        => update.ValueKind == JsonValueKind.Object
           && update.TryGetProperty(key, out var m) && m.ValueKind == JsonValueKind.Object
           && m.TryGetProperty("text", out var t) && t.ValueKind == JsonValueKind.String
            ? t.GetString()
            : null;
}
@@ -1,3 +1,5 @@
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
@@ -5,38 +7,80 @@ namespace JobsMedical.Web.Services.Scraping;
/// <summary>Configuration for the Divar listing source.</summary>
public class DivarOptions
{
    /// <summary>Master switch; the source also needs at least one query to be enabled.</summary>
    public bool Enabled { get; set; }

    /// <summary>City segment of the search URL.</summary>
    public string City { get; set; } = "tehran";

    /// <summary>Category segment of the search URL.</summary>
    public string Category { get; set; } = "jobs";

    /// <summary>Search terms, one request per term.</summary>
    public List<string> Queries { get; set; } = new(); // e.g. "پرستار", "پزشک عمومی", "درمانگاه"

    /// <summary>Base address of Divar's web-search endpoint.</summary>
    public string BaseUrl { get; set; } = "https://api.divar.ir/v8/web-search";

    /// <summary>Maximum number of harvested texts kept per query.</summary>
    public int PerQuery { get; set; } = 25;
}
/// <summary>
/// Best-effort Divar fetch: queries Divar's web-search JSON for each term and harvests post
/// titles + descriptions. Divar's private API shifts shape over time, so we walk the JSON
/// tolerantly for any object carrying a "title" plus a nearby description field, and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
    private readonly DivarOptions _opts;
    private readonly IHttpClientFactory _http;
    private readonly ILogger<DivarListingSource> _log;

    /// <summary>Creates the source.</summary>
    /// <param name="opts">Divar configuration (city, category, queries, limits).</param>
    /// <param name="http">Factory used to obtain the named "scrape" HTTP client.</param>
    /// <param name="log">Logger for fetch diagnostics.</param>
    public DivarListingSource(IOptions<DivarOptions> opts, IHttpClientFactory http,
        ILogger<DivarListingSource> log)
    {
        _opts = opts.Value;
        _http = http;
        _log = log;
    }

    /// <summary>Source label attached to every scraped item.</summary>
    public string Name => "دیوار";

    /// <summary>True when the source is switched on and at least one query is configured.</summary>
    public bool Enabled => _opts.Enabled && _opts.Queries.Count > 0;

    /// <summary>
    /// Runs one search per configured query and collects up to <see cref="DivarOptions.PerQuery"/>
    /// distinct texts from each response. A failing query is logged and skipped.
    /// </summary>
    /// <param name="ct">Token used to cancel the HTTP calls.</param>
    /// <returns>The scraped items; empty when the source is disabled.</returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (!Enabled)
        {
            _log.LogInformation("Divar source disabled/unconfigured.");
            return Array.Empty<ScrapedItem>();
        }

        var client = _http.CreateClient("scrape");
        var items = new List<ScrapedItem>();
        // The same ad commonly matches several queries; emit each distinct text once per run.
        var seen = new HashSet<string>(StringComparer.Ordinal);

        foreach (var q in _opts.Queries.Where(q => !string.IsNullOrWhiteSpace(q)).Select(q => q.Trim()))
        {
            try
            {
                var url = $"{_opts.BaseUrl.TrimEnd('/')}/{_opts.City}/{_opts.Category}?q={Uri.EscapeDataString(q)}";
                var body = await client.GetStringAsync(url, ct);
                using var doc = JsonDocument.Parse(body);
                // seen.Add returns false for texts already emitted, so the filter doubles as dedupe.
                foreach (var text in Harvest(doc.RootElement).Where(seen.Add).Take(_opts.PerQuery))
                    items.Add(new ScrapedItem(Name, text, "https://divar.ir"));
            }
            // Once the caller has cancelled, stop instead of failing every remaining query.
            catch (Exception ex) when (!ct.IsCancellationRequested)
            {
                _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
            }
        }
        return items;
    }

    // Candidate description fields, tried in order; the first string one wins.
    private static readonly string[] DescKeys =
        { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };

    /// <summary>Walk the JSON; for each object with a string "title", emit title + first description.</summary>
    /// <remarks>Texts shorter than 15 characters are dropped. Nested objects and arrays are walked recursively.</remarks>
    private static IEnumerable<string> Harvest(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
            {
                var sb = new StringBuilder(t.GetString());
                foreach (var k in DescKeys)
                {
                    if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
                    {
                        sb.Append(" — ").Append(d.GetString());
                        break;
                    }
                }
                var text = sb.ToString().Trim();
                if (text.Length >= 15) yield return text;
            }

            foreach (var p in el.EnumerateObject())
                foreach (var s in Harvest(p.Value)) yield return s;
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
                foreach (var s in Harvest(item)) yield return s;
        }
    }
}
@@ -7,22 +7,24 @@ using Microsoft.EntityFrameworkCore;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Per-source counters for one ingestion run.</summary>
/// <param name="Source">Name of the listing source.</param>
/// <param name="Fetched">Number of items fetched from the source.</param>
/// <param name="Queued">Number of items queued for manual review.</param>
/// <param name="Published">Number of items auto-published.</param>
/// <param name="Flagged">Number of items flagged.</param>
/// <param name="Spam">Number of items discarded.</param>
/// <param name="Duplicates">Number of items skipped because their content hash already existed.</param>
public record SourceResult(string Source, int Fetched, int Queued, int Published, int Flagged, int Spam, int Duplicates);
/// <summary>Outcome of one ingestion run: the per-source results plus totals across them.</summary>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Items queued for review, summed over all sources.</summary>
    public int TotalQueued => Total(r => r.Queued);

    /// <summary>Items auto-published, summed over all sources.</summary>
    public int TotalPublished => Total(r => r.Published);

    /// <summary>Items flagged, summed over all sources.</summary>
    public int TotalFlagged => Total(r => r.Flagged);

    /// <summary>Items counted as spam, summed over all sources.</summary>
    public int TotalSpam => Total(r => r.Spam);

    /// <summary>Duplicate items, summed over all sources.</summary>
    public int TotalDuplicates => Total(r => r.Duplicates);

    // Sums one counter across every source result.
    private int Total(Func<SourceResult, int> counter)
    {
        return Sources.Sum(counter);
    }
}
/// <summary>
/// The scrape engine. Pulls from every enabled <see cref="IListingSource"/> and, for each item:
/// dedupe by content hash → parse with <see cref="IListingParser"/> → rule-validate with
/// <see cref="ListingValidator"/> → (optional) AI audit → decide. Each item is stored as a
/// <see cref="RawListing"/> with a status: New (queued for review), Flagged
/// (incomplete/suspicious), Discarded (spam or AI-rejected), or Normalized (auto-published).
/// Source-agnostic — add a source and it flows through unchanged.
/// The decision depends on admin settings:
/// • spam → Discarded
/// • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
/// • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
/// </summary>
public class IngestionService
{
@@ -30,16 +32,15 @@ public class IngestionService
private readonly IEnumerable<IListingSource> _sources;
private readonly IListingParser _parser;
private readonly ListingValidator _validator;
private readonly IAiAuditor _ai;
private readonly SettingsService _settings;
private readonly ILogger<IngestionService> _log;
/// <summary>Creates the ingestion engine.</summary>
/// <param name="db">Database context used for dedupe checks and for storing results.</param>
/// <param name="sources">All registered listing sources; only enabled ones are fetched.</param>
/// <param name="parser">Parser that extracts structured fields from raw text.</param>
/// <param name="validator">Rule-based validator applied to each parsed listing.</param>
/// <param name="ai">AI auditor consulted when AI is enabled in the settings.</param>
/// <param name="settings">Service that loads the platform settings row.</param>
/// <param name="log">Logger for ingestion diagnostics.</param>
public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources, IListingParser parser,
    ListingValidator validator, IAiAuditor ai, SettingsService settings, ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}
public IReadOnlyList<(string Name, bool Enabled)> Sources =>
@@ -47,18 +48,22 @@ public class IngestionService
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
{
var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);
var settings = await _settings.GetAsync();
var roles = await _db.Roles.ToListAsync(ct);
var cities = await _db.Cities.ToListAsync(ct);
var districts = await _db.Districts.ToListAsync(ct);
var roleNames = roles.Select(r => r.Name).ToList();
var cityNames = cities.Select(c => c.Name).ToList();
var districtNames = districts.Select(d => d.Name).ToList();
var results = new List<SourceResult>();
foreach (var source in _sources.Where(s => s.Enabled))
{
int fetched = 0, queued = 0, flagged = 0, spam = 0, dupes = 0;
int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
IReadOnlyList<ScrapedItem> items;
try { items = await source.FetchAsync(ct); }
catch (Exception ex) { _log.LogError(ex, "Source {Source} fetch failed", source.Name); continue; }
catch (Exception ex) { _log.LogError(ex, "Source {Source} failed", source.Name); continue; }
foreach (var item in items)
{
@@ -66,42 +71,155 @@ public class IngestionService
var hash = Hash(item.RawText);
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
var parsed = _parser.Parse(item.RawText, roles, cities, districts);
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
var val = _validator.Validate(item.RawText, parsed);
var status = val.IsSpam ? RawListingStatus.Discarded
: val.IsValid ? RawListingStatus.New
: RawListingStatus.Flagged;
if (status == RawListingStatus.New) queued++;
else if (status == RawListingStatus.Flagged) flagged++;
else spam++;
AiAuditResult? ai = null;
if (settings.AiEnabled && !val.IsSpam)
ai = await _ai.AuditAsync(item.RawText, settings, ct);
_db.RawListings.Add(new RawListing
var (status, reason, confidence) = Decide(settings, val, ai);
var raw = new RawListing
{
SourceChannel = item.Source,
SourceUrl = item.SourceUrl,
RawText = item.RawText.Trim(),
ContentHash = hash,
Confidence = val.Confidence,
ValidationNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null,
Confidence = confidence,
ValidationNotes = reason,
Status = status,
});
};
_db.RawListings.Add(raw);
if (status == RawListingStatus.Normalized)
{
try { Publish(parsed, ai, raw, roles, cities, districts); published++; }
catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
}
else if (status == RawListingStatus.New) queued++;
else if (status == RawListingStatus.Flagged) flagged++;
else spam++;
}
await _db.SaveChangesAsync(ct);
results.Add(new SourceResult(source.Name, fetched, queued, flagged, spam, dupes));
_log.LogInformation("Ingestion {Source}: fetched={F} queued={Q} flagged={Fl} spam={S} dupes={D}",
source.Name, fetched, queued, flagged, spam, dupes);
results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
_log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
source.Name, fetched, queued, published, flagged, spam, dupes);
}
return new IngestionSummary(results);
}
/// <summary>
/// Chooses the status, note text and confidence for one listing from the rule validation
/// result, the optional AI verdict and the admin settings.
/// </summary>
/// <param name="s">Settings providing the mode, auto-approve flag and publish threshold.</param>
/// <param name="val">Rule-based validation result.</param>
/// <param name="ai">AI verdict, or null when no AI verdict is available.</param>
/// <returns>The status to store, the note text, and the confidence to record.</returns>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    string? ruleNotes = null;
    if (val.Issues.Count > 0)
        ruleNotes = string.Join("؛ ", val.Issues);

    // Spam is discarded regardless of any AI verdict.
    if (val.IsSpam)
        return (RawListingStatus.Discarded, Join("اسپم", ruleNotes), val.Confidence);

    // Rule-based path: used when there is no AI verdict.
    if (ai is null)
    {
        if (!val.IsValid)
            return (RawListingStatus.Flagged, ruleNotes, val.Confidence);

        var meetsThreshold = s.Mode == IngestionMode.Automatic
            && val.Confidence >= s.AutoPublishMinConfidence;
        return (meetsThreshold ? RawListingStatus.Normalized : RawListingStatus.New,
            ruleNotes, val.Confidence);
    }

    // AI path: the verdict decides, and its confidence is the one recorded.
    var headline = $"AI: {ai.Decision} ({ai.Confidence}٪)";
    if (ai.Reason is not null)
        headline += $" — {ai.Reason}";
    var aiNote = Join(headline, ruleNotes);

    RawListingStatus outcome;
    if (ai.Reject)
        outcome = RawListingStatus.Discarded;
    else if (ai.Approve)
        outcome = s.Mode == IngestionMode.Automatic && s.AiAutoApprove
            ? RawListingStatus.Normalized
            : RawListingStatus.New;
    else
        outcome = RawListingStatus.Flagged; // any other decision means "review"

    return (outcome, aiNote, ai.Confidence);
}
/// <summary>
/// Turns a listing into a published entity: resolves the role, city and district, resolves or
/// creates an unverified facility, then adds either a JobOpening or a Shift to the context and
/// marks <paramref name="raw"/> as Normalized. Nothing is saved here; the caller saves.
/// </summary>
/// <param name="parsed">Fields extracted by the rule-based parser.</param>
/// <param name="ai">AI verdict whose structured data, when present and non-blank, takes precedence.</param>
/// <param name="raw">The raw listing being published.</param>
/// <param name="roles">Known roles; the first one is the fallback when no name matches.</param>
/// <param name="cities">Known cities; the first active one (else the first) is the fallback.</param>
/// <param name="districts">Known districts, matched by name within the resolved city.</param>
/// <exception cref="InvalidOperationException">Thrown when <paramref name="roles"/> or <paramref name="cities"/> is empty.</exception>
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
    List<Role> roles, List<City> cities, List<District> districts)
{
    // Prefer the AI value only when it is non-blank. A plain "??" would let an empty string
    // from the model mask the parser's value.
    static string? Prefer(string? aiValue, string? parsedValue)
        => string.IsNullOrWhiteSpace(aiValue) ? parsedValue : aiValue.Trim();

    var d = ai?.Data;
    var roleName = Prefer(d?.Role, parsed.RoleName);
    var cityName = Prefer(d?.City, parsed.CityName);
    var districtName = Prefer(d?.District, parsed.DistrictName);

    // NOTE(review): an unmatched role/city falls back to an arbitrary existing row, so an
    // auto-published listing can carry a guessed role or city. Confirm this is intended.
    var role = roles.FirstOrDefault(r => r.Name == roleName) ?? roles.First();
    var city = cities.FirstOrDefault(c => c.Name == cityName)
        ?? cities.FirstOrDefault(c => c.IsActive) ?? cities.First();
    var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);

    var facilityName = !string.IsNullOrWhiteSpace(d?.FacilityName) ? d!.FacilityName!.Trim()
        : $"مرکز درمانی (از {raw.SourceChannel})";

    // Check entities added earlier in this run (not yet saved) before querying the database.
    var facility = _db.Facilities.Local.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id)
        ?? _db.Facilities.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id);
    if (facility is null)
    {
        facility = new Facility
        {
            Name = facilityName, Type = FacilityType.Clinic, City = city, DistrictId = district?.Id,
            Phone = parsed.Phone, IsVerified = false,
        };
        _db.Facilities.Add(facility);
    }

    var kind = (string.IsNullOrWhiteSpace(d?.Kind) ? parsed.Kind.ToString() : d!.Kind!).ToLowerInvariant();
    if (kind.Contains("job") || kind.Contains("استخدام"))
    {
        _db.JobOpenings.Add(new JobOpening
        {
            Facility = facility, Role = role,
            Title = !string.IsNullOrWhiteSpace(d?.Title) ? d!.Title!.Trim() : $"استخدام {role.Name}",
            EmploymentType = MapEmployment(d?.EmploymentType, parsed.EmploymentType),
            // NOTE(review): pay fields come from the parser only; the AI's PayAmount/SharePercent are unused.
            SalaryMin = parsed.PayAmount,
            Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
            SourceUrl = raw.SourceUrl,
        });
    }
    else
    {
        var st = MapShiftType(d?.ShiftType, parsed.ShiftType);
        var (start, end) = DefaultTimes(st);
        _db.Shifts.Add(new Shift
        {
            Facility = facility, Role = role,
            // The shift is dated "tomorrow", computed from the UTC date.
            Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
            StartTime = start, EndTime = end, ShiftType = st,
            SpecialtyRequired = role.Name, Description = raw.RawText,
            PayType = parsed.SharePercent is not null && parsed.PayAmount is null ? PayType.Percentage
                : parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
            PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
            Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
        });
    }

    raw.Status = RawListingStatus.Normalized;
}
/// <summary>
/// Maps the AI's shift keyword ("day", "evening", "night", "oncall", ignoring case) to a
/// <see cref="ShiftType"/>. Any other value falls back to the parser's value, then to Day.
/// </summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed)
{
    var keyword = ai?.ToLowerInvariant();
    if (keyword == "day") return ShiftType.Day;
    if (keyword == "evening") return ShiftType.Evening;
    if (keyword == "night") return ShiftType.Night;
    if (keyword == "oncall") return ShiftType.OnCall;
    return parsed ?? ShiftType.Day;
}
/// <summary>
/// Maps the AI's employment keyword ("parttime", "contract", "plan", "fulltime", ignoring case)
/// to an <see cref="EmploymentType"/>. Any other value falls back to the parser's value, then
/// to FullTime.
/// </summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed)
{
    var keyword = ai?.ToLowerInvariant();
    if (keyword == "parttime") return EmploymentType.PartTime;
    if (keyword == "contract") return EmploymentType.Contract;
    if (keyword == "plan") return EmploymentType.Plan;
    if (keyword == "fulltime") return EmploymentType.FullTime;
    return parsed ?? EmploymentType.FullTime;
}
/// <summary>
/// Default start/end times for a shift type: Day 08:00–14:00, Evening 14:00–20:00,
/// Night 20:00–08:00, and 08:00–08:00 for anything else.
/// </summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t)
{
    var eight = new TimeOnly(8, 0);
    var fourteen = new TimeOnly(14, 0);
    var twenty = new TimeOnly(20, 0);

    switch (t)
    {
        case ShiftType.Day:
            return (eight, fourteen);
        case ShiftType.Evening:
            return (fourteen, twenty);
        case ShiftType.Night:
            return (twenty, eight);
        default:
            return (eight, eight);
    }
}
/// <summary>Returns <paramref name="a"/> alone when <paramref name="b"/> is null or empty; otherwise "a | b".</summary>
private static string? Join(string a, string? b)
{
    if (string.IsNullOrEmpty(b))
    {
        return a;
    }
    return a + " | " + b;
}
/// <summary>
/// Lowercase SHA-256 hex of the text after trimming and collapsing every whitespace run to a
/// single space, so the same post hashes identically across runs (for cross-run dedupe).
/// </summary>
/// <param name="text">The raw listing text; null is treated as empty.</param>
/// <returns>64 lowercase hexadecimal characters.</returns>
private static string Hash(string text)
{
    var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
    return Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(normalized))).ToLowerInvariant();
}
}
@@ -0,0 +1,40 @@
using JobsMedical.Web.Data;
using JobsMedical.Web.Models;
using Microsoft.EntityFrameworkCore;
namespace JobsMedical.Web.Services.Scraping;
/// <summary>Loads/creates the single platform-settings row (Id=1).</summary>
public class SettingsService
{
    private readonly AppDbContext _db;

    /// <summary>Creates the service over the given database context.</summary>
    public SettingsService(AppDbContext db) => _db = db;

    /// <summary>
    /// Returns the settings row with Id 1, inserting and saving a default row first when none
    /// exists yet.
    /// </summary>
    /// <param name="ct">Token used to cancel the database calls.</param>
    /// <returns>The settings entity loaded from, or just added to, the context.</returns>
    public async Task<AppSetting> GetAsync(CancellationToken ct = default)
    {
        var s = await _db.AppSettings.FirstOrDefaultAsync(x => x.Id == 1, ct);
        if (s is null)
        {
            s = new AppSetting { Id = 1 };
            _db.AppSettings.Add(s);
            await _db.SaveChangesAsync(ct);
        }
        return s;
    }

    /// <summary>
    /// Copies the editable fields from <paramref name="incoming"/> onto the stored row and saves.
    /// The publish threshold is clamped to 0..100, string fields are trimmed, and a blank system
    /// prompt is replaced by <see cref="AppSetting.DefaultPrompt"/>.
    /// </summary>
    /// <param name="incoming">The values submitted by the admin.</param>
    /// <param name="ct">Token used to cancel the database calls.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="incoming"/> is null.</exception>
    public async Task SaveAsync(AppSetting incoming, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(incoming);

        var s = await GetAsync(ct);
        s.Mode = incoming.Mode;
        s.AutoPublishMinConfidence = Math.Clamp(incoming.AutoPublishMinConfidence, 0, 100);
        s.AiEnabled = incoming.AiEnabled;
        s.AiEndpoint = incoming.AiEndpoint?.Trim();
        s.AiApiKey = incoming.AiApiKey?.Trim();
        s.AiModel = incoming.AiModel?.Trim();
        s.AiSystemPrompt = string.IsNullOrWhiteSpace(incoming.AiSystemPrompt)
            ? AppSetting.DefaultPrompt : incoming.AiSystemPrompt;
        s.AiAutoApprove = incoming.AiAutoApprove;
        s.UpdatedAt = DateTime.UtcNow;
        await _db.SaveChangesAsync(ct);
    }
}
@@ -1,3 +1,5 @@
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
namespace JobsMedical.Web.Services.Scraping;
@@ -5,40 +7,71 @@ namespace JobsMedical.Web.Services.Scraping;
/// <summary>Configuration for the Telegram listing source.</summary>
public class TelegramOptions
{
    /// <summary>Master switch; the source also needs at least one channel to be enabled.</summary>
    public bool Enabled { get; set; }

    public string? BotToken { get; set; } // optional (for private channels later)

    public List<string> Channels { get; set; } = new(); // public channel usernames (no @)

    /// <summary>Maximum number of messages kept per channel.</summary>
    public int PerChannel { get; set; } = 20;
}
/// <summary>
/// Reads public Telegram channels via the web preview (https://t.me/s/&lt;channel&gt;) — no bot
/// token or login needed for public channels. Each message's text becomes a ScrapedItem.
/// </summary>
public class TelegramListingSource : IListingSource
{
    // Message bodies live in <div class="tgme_widget_message_text ...">...</div>.
    private static readonly Regex MessageText = new(
        "<div class=\"tgme_widget_message_text[^\"]*\"[^>]*>(.*?)</div>",
        RegexOptions.Singleline | RegexOptions.Compiled);

    private readonly TelegramOptions _opts;
    private readonly IHttpClientFactory _http;
    private readonly ILogger<TelegramListingSource> _log;

    /// <summary>Creates the source.</summary>
    /// <param name="opts">Telegram configuration (enabled flag, channels, per-channel limit).</param>
    /// <param name="http">Factory used to obtain the named "scrape" HTTP client.</param>
    /// <param name="log">Logger for fetch diagnostics.</param>
    public TelegramListingSource(IOptions<TelegramOptions> opts, IHttpClientFactory http,
        ILogger<TelegramListingSource> log)
    {
        _opts = opts.Value;
        _http = http;
        _log = log;
    }

    /// <summary>Source label for this source.</summary>
    public string Name => "تلگرام";

    /// <summary>True when the source is switched on and at least one channel is configured.</summary>
    public bool Enabled => _opts.Enabled && _opts.Channels.Count > 0;

    /// <summary>
    /// Downloads the preview page of every configured channel and returns up to
    /// <see cref="TelegramOptions.PerChannel"/> of the most recent messages from each. A failing
    /// channel is logged and skipped.
    /// </summary>
    /// <param name="ct">Token used to cancel the HTTP calls.</param>
    /// <returns>The scraped items; empty when the source is disabled.</returns>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (!Enabled)
        {
            _log.LogInformation("Telegram source disabled/unconfigured.");
            return Array.Empty<ScrapedItem>();
        }

        var client = _http.CreateClient("scrape");
        var items = new List<ScrapedItem>();
        var channels = _opts.Channels
            .Select(c => (c ?? "").Trim().TrimStart('@'))
            .Where(c => c.Length > 0);

        foreach (var ch in channels)
        {
            try
            {
                var html = await client.GetStringAsync($"https://t.me/s/{Uri.EscapeDataString(ch)}", ct);
                // The preview page lists messages oldest-first, so the newest are at the end.
                foreach (var text in ExtractMessages(html).TakeLast(_opts.PerChannel))
                    items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}"));
            }
            // Once the caller has cancelled, stop instead of failing every remaining channel.
            catch (Exception ex) when (!ct.IsCancellationRequested)
            {
                _log.LogWarning(ex, "Telegram fetch failed for {Channel}", ch);
            }
        }
        return items;
    }

    /// <summary>
    /// Yields the plain text of each message body found in the preview HTML, in page order,
    /// skipping texts shorter than 15 characters.
    /// </summary>
    private static IEnumerable<string> ExtractMessages(string html)
    {
        foreach (Match m in MessageText.Matches(html))
        {
            var text = HtmlUtil.ToPlainText(m.Groups[1].Value);
            if (text.Length >= 15) yield return text;
        }
    }
}
/// <summary>Small HTML-to-text helper for scraped message bodies.</summary>
internal static class HtmlUtil
{
    private static readonly Regex LineBreakTag = new("<br\\s*/?>", RegexOptions.IgnoreCase);
    private static readonly Regex AnyTag = new("<[^>]+>");
    private static readonly Regex SpaceOrTabRun = new("[ \\t]+");

    /// <summary>
    /// Converts an HTML fragment to plain text: &lt;br&gt; tags become newlines, all other tags
    /// are removed, HTML entities are decoded, runs of spaces/tabs collapse to one space, and
    /// the result is trimmed.
    /// </summary>
    /// <param name="html">The HTML fragment to convert.</param>
    /// <returns>The plain-text rendering of the fragment.</returns>
    public static string ToPlainText(string html)
    {
        var withNewlines = LineBreakTag.Replace(html, "\n");
        var withoutTags = AnyTag.Replace(withNewlines, ""); // strip remaining tags
        var decoded = WebUtility.HtmlDecode(withoutTags);
        var collapsed = SpaceOrTabRun.Replace(decoded, " ");
        return collapsed.Trim();
    }
}