Real channel fetch (Telegram/Bale/Divar) + AI-audited automation engine + CI/CD
- Fetch: Telegram via t.me/s, Bale via Bot API, Divar via web-search (HttpClient, config-gated, graceful) - AI layer: DB-backed AppSetting (mode auto/manual, thresholds, AI endpoint/model/key/prompt/framework, auto-approve); OpenAI-compatible IAiAuditor (self-host/Iranian endpoints; fails safe to manual) - Pipeline: fetch → dedupe(hash) → parse → validate → AI audit → Discard/Flag/Queue/auto-publish (resolve-or-create facility) - Admin: /Admin/Settings automation+AI panel; queue shows confidence + AI verdict; flagged section - CI/CD: Dockerfile, docker-compose.prod.yml, .gitea/workflows/ci-cd.yml, nginx vhost, DEPLOY.md; forwarded headers + /healthz + prod reference-only seed; ports 22/80/443 only Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,108 @@
|
||||
using System.Net.Http.Headers;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using JobsMedical.Web.Models;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>
/// Structured fields an AI audit may extract from a raw listing. Every field is optional;
/// a null value means no usable value was supplied for that field.
/// </summary>
public record AiStructured(
    string? Kind,
    string? Role,
    string? City,
    string? District,
    string? ShiftType,
    string? EmploymentType,
    long? PayAmount,
    int? SharePercent,
    string? Title,
    string? FacilityName);
|
||||
|
||||
/// <summary>
/// An AI verdict on a raw listing: the decision keyword, a confidence score, an optional
/// free-text reason and any structured fields returned with it.
/// </summary>
public record AiAuditResult(string Decision, int Confidence, string? Reason, AiStructured? Data)
{
    /// <summary>True when <see cref="Decision"/> equals "approve", ignoring case.</summary>
    public bool Approve => DecisionIs("approve");

    /// <summary>True when <see cref="Decision"/> equals "reject", ignoring case.</summary>
    public bool Reject => DecisionIs("reject");

    // Ordinal, case-insensitive comparison of the decision keyword.
    private bool DecisionIs(string keyword) =>
        Decision.Equals(keyword, StringComparison.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>Contract for an AI component that audits a raw listing.</summary>
public interface IAiAuditor
{
    /// <summary>
    /// Audits a raw post. Returns null when AI is off or the call fails, so the caller can
    /// fall back to the manual / rule-based path (fail safe).
    /// </summary>
    /// <param name="rawText">The raw listing text to audit.</param>
    /// <param name="settings">Platform settings carrying the AI configuration.</param>
    /// <param name="ct">Token used to cancel the audit.</param>
    /// <returns>The verdict, or null when none is available.</returns>
    Task<AiAuditResult?> AuditAsync(
        string rawText,
        AppSetting settings,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Calls any OpenAI-compatible chat-completions endpoint (self-hosted vLLM/Ollama, or an Iranian
/// provider — OpenAI/Anthropic are blocked from Iran). The admin-set system prompt is the
/// "framework" that tells the model how to approve/reject/structure. We ask for strict JSON and
/// parse it. Any failure returns null so ingestion falls back to the rule-based path; only a
/// cancellation requested by the caller is rethrown.
/// </summary>
public class OpenAiCompatibleAuditor : IAiAuditor
{
    private readonly IHttpClientFactory _http;
    private readonly ILogger<OpenAiCompatibleAuditor> _log;

    public OpenAiCompatibleAuditor(IHttpClientFactory http, ILogger<OpenAiCompatibleAuditor> log)
    {
        _http = http;
        _log = log;
    }

    /// <summary>
    /// Sends <paramref name="rawText"/> to the configured endpoint and parses the model's JSON verdict.
    /// </summary>
    /// <param name="rawText">The raw listing text.</param>
    /// <param name="s">Settings providing the AI switch, endpoint, model, API key and system prompt.</param>
    /// <param name="ct">Token used to cancel the HTTP call.</param>
    /// <returns>
    /// The parsed verdict, or null when AI is disabled, no endpoint is set, the response has no
    /// content, or anything else goes wrong (HTTP error, timeout, malformed JSON).
    /// </returns>
    /// <exception cref="OperationCanceledException">When <paramref name="ct"/> was cancelled by the caller.</exception>
    public async Task<AiAuditResult?> AuditAsync(string rawText, AppSetting s, CancellationToken ct = default)
    {
        if (!s.AiEnabled || string.IsNullOrWhiteSpace(s.AiEndpoint)) return null;

        try
        {
            var payload = new
            {
                model = string.IsNullOrWhiteSpace(s.AiModel) ? "gpt-4o-mini" : s.AiModel,
                temperature = 0,
                response_format = new { type = "json_object" },
                messages = new object[]
                {
                    new { role = "system", content = s.AiSystemPrompt },
                    new { role = "user", content = "آگهی خام:\n" + rawText + "\n\nفقط با JSON پاسخ بده." },
                },
            };

            var client = _http.CreateClient("ai");
            client.Timeout = TimeSpan.FromSeconds(30);

            using var req = new HttpRequestMessage(HttpMethod.Post, s.AiEndpoint)
            {
                Content = new StringContent(JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json"),
            };
            if (!string.IsNullOrWhiteSpace(s.AiApiKey))
                req.Headers.Authorization = new AuthenticationHeaderValue("Bearer", s.AiApiKey);

            using var resp = await client.SendAsync(req, ct);
            resp.EnsureSuccessStatusCode();
            var body = await resp.Content.ReadAsStringAsync(ct);

            using var doc = JsonDocument.Parse(body);
            var content = doc.RootElement
                .GetProperty("choices")[0].GetProperty("message").GetProperty("content").GetString();
            if (string.IsNullOrWhiteSpace(content)) return null;

            return ParseVerdict(content);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller asked to stop: that is not an AI failure, so do not disguise it as
            // "no verdict". An HttpClient timeout also surfaces as a cancellation exception but
            // leaves ct untouched, so it still takes the fail-safe path below.
            throw;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "AI audit failed — falling back to rule-based decision.");
            return null;
        }
    }

    /// <summary>
    /// Parses the model's message content into a verdict. Missing or wrongly-typed fields fall
    /// back individually (decision → "review", confidence → 50, everything else → null) instead
    /// of invalidating the whole verdict. Confidence is clamped to 0–100.
    /// </summary>
    /// <param name="json">The message content; surrounding code fences or prose are tolerated.</param>
    /// <returns>The verdict, or null when no JSON object can be located in the content.</returns>
    private static AiAuditResult? ParseVerdict(string json)
    {
        // The content itself should be a JSON object; tolerate code fences.
        json = json.Trim().Trim('`');
        var start = json.IndexOf('{');
        var end = json.LastIndexOf('}');
        if (start < 0 || end <= start) return null;
        json = json.Substring(start, end - start + 1);

        using var doc = JsonDocument.Parse(json);
        var r = doc.RootElement;

        string? S(string k) => r.TryGetProperty(k, out var v) && v.ValueKind == JsonValueKind.String ? v.GetString() : null;

        // JsonElement.TryGetInt32/TryGetInt64 throw InvalidOperationException when the value is not
        // a JSON number (e.g. null or "85"), so the kind must be checked before calling them.
        bool Num(string k, out JsonElement v) => r.TryGetProperty(k, out v) && v.ValueKind == JsonValueKind.Number;
        int I(string k, int d) => Num(k, out var v) && v.TryGetInt32(out var n) ? n : d;
        long? L(string k) => Num(k, out var v) && v.TryGetInt64(out var n) ? n : null;
        int? NI(string k) => Num(k, out var v) && v.TryGetInt32(out var n) ? n : null;

        var decision = (S("decision") ?? "review").ToLowerInvariant();
        var data = new AiStructured(S("kind"), S("role"), S("city"), S("district"), S("shiftType"),
            S("employmentType"), L("payAmount"), NI("sharePercent"), S("title"), S("facilityName"));
        return new AiAuditResult(decision, Math.Clamp(I("confidence", 50), 0, 100), S("reason"), data);
    }
}
|
||||
@@ -0,0 +1,68 @@
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>Configuration for the Bale listing source.</summary>
public class BaleOptions
{
    /// <summary>Switches the source on; defaults to off.</summary>
    public bool Enabled { get; set; }

    /// <summary>Bot token placed in the Bot API request path.</summary>
    public string? BotToken { get; set; }

    /// <summary>Bale Bot API host.</summary>
    public string BaseUrl { get; set; } = "https://tapi.bale.ai";
}
|
||||
|
||||
/// <summary>
/// Bale (Iranian messenger) source via its Telegram-compatible Bot API getUpdates. The bot must
/// be a member/admin of the channels it should read. Pulls text from messages and channel posts.
/// </summary>
public class BaleListingSource : IListingSource
{
    private readonly BaleOptions _opts;
    private readonly IHttpClientFactory _http;
    private readonly ILogger<BaleListingSource> _log;

    public BaleListingSource(IOptions<BaleOptions> opts, IHttpClientFactory http,
        ILogger<BaleListingSource> log)
    {
        _opts = opts.Value;
        _http = http;
        _log = log;
    }

    /// <summary>Display name of the source; also used as the source label of each item.</summary>
    public string Name => "بله";

    /// <summary>True only when the source is switched on and a bot token is configured.</summary>
    public bool Enabled => _opts.Enabled && !string.IsNullOrWhiteSpace(_opts.BotToken);

    /// <summary>
    /// Calls getUpdates once and returns the text of every update that has at least 15
    /// characters after trimming. Fails soft: any error is logged and yields an empty list.
    /// </summary>
    /// <param name="ct">Token used to cancel the HTTP call.</param>
    /// <exception cref="OperationCanceledException">When <paramref name="ct"/> was cancelled by the caller.</exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (!Enabled)
        {
            _log.LogInformation("Bale source disabled/unconfigured.");
            return Array.Empty<ScrapedItem>();
        }

        try
        {
            var client = _http.CreateClient("scrape");
            // NOTE(review): getUpdates is called without an offset, so updates are never
            // acknowledged here; presumably the downstream content-hash dedupe absorbs the
            // repeats — confirm this is intended.
            var url = $"{_opts.BaseUrl.TrimEnd('/')}/bot{_opts.BotToken}/getUpdates";
            var body = await client.GetStringAsync(url, ct);

            using var doc = JsonDocument.Parse(body);
            var root = doc.RootElement;
            if (root.ValueKind != JsonValueKind.Object
                || !root.TryGetProperty("result", out var result)
                || result.ValueKind != JsonValueKind.Array)
            {
                return Array.Empty<ScrapedItem>();
            }

            var items = new List<ScrapedItem>();
            foreach (var update in result.EnumerateArray())
            {
                // Channel posts take precedence over direct messages.
                var text = (TextOf(update, "channel_post") ?? TextOf(update, "message"))?.Trim();
                if (text is { Length: >= 15 })
                    items.Add(new ScrapedItem(Name, text));
            }
            return items;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a fetch failure; let it propagate.
            throw;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "Bale fetch failed.");
            return Array.Empty<ScrapedItem>();
        }
    }

    /// <summary>
    /// Returns update[key].text when it is a JSON string, otherwise null. The kind checks matter:
    /// JsonElement.TryGetProperty throws on non-object values, and one malformed update must not
    /// discard the whole batch.
    /// </summary>
    private static string? TextOf(JsonElement update, string key)
        => update.ValueKind == JsonValueKind.Object
           && update.TryGetProperty(key, out var m)
           && m.ValueKind == JsonValueKind.Object
           && m.TryGetProperty("text", out var t)
           && t.ValueKind == JsonValueKind.String
            ? t.GetString()
            : null;
}
|
||||
@@ -1,3 +1,5 @@
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
@@ -5,38 +7,80 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
/// <summary>Configuration for the Divar listing source.</summary>
public class DivarOptions
{
    /// <summary>Switches the source on; defaults to off.</summary>
    public bool Enabled { get; set; }

    /// <summary>City segment of the search URL.</summary>
    public string City { get; set; } = "tehran";

    /// <summary>Category segment of the search URL.</summary>
    public string Category { get; set; } = "jobs";

    /// <summary>Search terms, one request per term (e.g. the Persian for "nurse", "general practitioner", "clinic").</summary>
    public List<string> Queries { get; set; } = new();

    /// <summary>Base address of the web-search endpoint.</summary>
    public string BaseUrl { get; set; } = "https://api.divar.ir/v8/web-search";

    /// <summary>Maximum number of harvested texts kept per query.</summary>
    public int PerQuery { get; set; } = 25;
}
|
||||
|
||||
/// <summary>
/// Best-effort Divar fetch: queries Divar's web-search JSON for each term and harvests post
/// titles + descriptions. Divar's private API shifts shape over time, so we walk the JSON
/// tolerantly for any object carrying a "title" plus a nearby description field, and fail soft.
/// </summary>
public class DivarListingSource : IListingSource
{
    // Description-like property names, tried in this order; the first string match wins.
    private static readonly string[] DescKeys =
        { "description", "middle_description_text", "subtitle", "bottom_description_text", "normal_text" };

    private readonly DivarOptions _opts;
    private readonly IHttpClientFactory _http;
    private readonly ILogger<DivarListingSource> _log;

    public DivarListingSource(IOptions<DivarOptions> opts, IHttpClientFactory http,
        ILogger<DivarListingSource> log)
    {
        _opts = opts.Value;
        _http = http;
        _log = log;
    }

    /// <summary>Display name of the source; also used as the source label of each item.</summary>
    public string Name => "دیوار";

    /// <summary>True only when the source is switched on and at least one query is configured.</summary>
    public bool Enabled => _opts.Enabled && _opts.Queries.Count > 0;

    /// <summary>
    /// Runs one search per configured query and collects up to <see cref="DivarOptions.PerQuery"/>
    /// harvested texts from each. A failing query is logged and skipped; the others still run.
    /// </summary>
    /// <param name="ct">Token used to cancel the HTTP calls.</param>
    /// <exception cref="OperationCanceledException">When <paramref name="ct"/> was cancelled by the caller.</exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (!Enabled)
        {
            _log.LogInformation("Divar source disabled/unconfigured.");
            return Array.Empty<ScrapedItem>();
        }

        var client = _http.CreateClient("scrape");
        var items = new List<ScrapedItem>();

        // Null/blank entries can come from configuration binding; skip them instead of throwing
        // outside the per-query try block.
        foreach (var q in _opts.Queries.Where(q => !string.IsNullOrWhiteSpace(q)).Select(q => q.Trim()))
        {
            try
            {
                var url = $"{_opts.BaseUrl.TrimEnd('/')}/{_opts.City}/{_opts.Category}?q={Uri.EscapeDataString(q)}";
                var body = await client.GetStringAsync(url, ct);
                using var doc = JsonDocument.Parse(body);
                foreach (var text in Harvest(doc.RootElement).Take(_opts.PerQuery))
                    items.Add(new ScrapedItem(Name, text, "https://divar.ir"));
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Caller-requested cancellation must stop the whole fetch, not just this query.
                throw;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Divar fetch failed for query {Query}", q);
            }
        }
        return items;
    }

    /// <summary>Walk the JSON; for each object with a string "title", emit title + first description.</summary>
    /// <remarks>Texts shorter than 15 characters after trimming are dropped. Nested objects and arrays are walked depth-first.</remarks>
    private static IEnumerable<string> Harvest(JsonElement el)
    {
        if (el.ValueKind == JsonValueKind.Object)
        {
            if (el.TryGetProperty("title", out var t) && t.ValueKind == JsonValueKind.String)
            {
                var sb = new StringBuilder(t.GetString());
                foreach (var k in DescKeys)
                {
                    if (el.TryGetProperty(k, out var d) && d.ValueKind == JsonValueKind.String)
                    {
                        sb.Append(" — ").Append(d.GetString());
                        break;
                    }
                }

                var text = sb.ToString().Trim();
                if (text.Length >= 15) yield return text;
            }

            foreach (var p in el.EnumerateObject())
                foreach (var s in Harvest(p.Value))
                    yield return s;
        }
        else if (el.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in el.EnumerateArray())
                foreach (var s in Harvest(item))
                    yield return s;
        }
    }
}
|
||||
|
||||
@@ -7,22 +7,24 @@ using Microsoft.EntityFrameworkCore;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>Per-source counters for one ingestion run.</summary>
public record SourceResult(
    string Source,
    int Fetched,
    int Queued,
    int Published,
    int Flagged,
    int Spam,
    int Duplicates);
|
||||
|
||||
/// <summary>Outcome of one ingestion run: the per-source results plus totals summed across them.</summary>
public record IngestionSummary(List<SourceResult> Sources)
{
    /// <summary>Sum of <c>Queued</c> over all sources.</summary>
    public int TotalQueued => Sources.Select(r => r.Queued).Sum();

    /// <summary>Sum of <c>Published</c> over all sources.</summary>
    public int TotalPublished => Sources.Select(r => r.Published).Sum();

    /// <summary>Sum of <c>Flagged</c> over all sources.</summary>
    public int TotalFlagged => Sources.Select(r => r.Flagged).Sum();

    /// <summary>Sum of <c>Spam</c> over all sources.</summary>
    public int TotalSpam => Sources.Select(r => r.Spam).Sum();

    /// <summary>Sum of <c>Duplicates</c> over all sources.</summary>
    public int TotalDuplicates => Sources.Select(r => r.Duplicates).Sum();
}
|
||||
|
||||
/// <summary>
/// The scrape engine. For every enabled source: dedupe by content hash → parse → rule-validate →
/// (optional) AI audit → decide. Decision depends on admin settings:
///  • spam → Discarded
///  • AI on: AI verdict drives approve/reject/review; approve + Automatic + AiAutoApprove → publish
///  • AI off: Automatic + confidence ≥ threshold → publish; else queue/flag
/// "Publish" resolves-or-creates an (unverified) facility and creates the Shift/JobOpening.
/// </summary>
|
||||
public class IngestionService
|
||||
{
|
||||
@@ -30,16 +32,15 @@ public class IngestionService
|
||||
private readonly IEnumerable<IListingSource> _sources;
|
||||
private readonly IListingParser _parser;
|
||||
private readonly ListingValidator _validator;
|
||||
private readonly IAiAuditor _ai;
|
||||
private readonly SettingsService _settings;
|
||||
private readonly ILogger<IngestionService> _log;
|
||||
|
||||
/// <summary>Creates the engine with its database context, listing sources and pipeline collaborators.</summary>
public IngestionService(AppDbContext db, IEnumerable<IListingSource> sources, IListingParser parser,
    ListingValidator validator, IAiAuditor ai, SettingsService settings, ILogger<IngestionService> log)
{
    _db = db;
    _sources = sources;
    _parser = parser;
    _validator = validator;
    _ai = ai;
    _settings = settings;
    _log = log;
}
|
||||
|
||||
public IReadOnlyList<(string Name, bool Enabled)> Sources =>
|
||||
@@ -47,18 +48,22 @@ public class IngestionService
|
||||
|
||||
public async Task<IngestionSummary> RunAsync(CancellationToken ct = default)
|
||||
{
|
||||
var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct);
|
||||
var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct);
|
||||
var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct);
|
||||
var settings = await _settings.GetAsync();
|
||||
var roles = await _db.Roles.ToListAsync(ct);
|
||||
var cities = await _db.Cities.ToListAsync(ct);
|
||||
var districts = await _db.Districts.ToListAsync(ct);
|
||||
var roleNames = roles.Select(r => r.Name).ToList();
|
||||
var cityNames = cities.Select(c => c.Name).ToList();
|
||||
var districtNames = districts.Select(d => d.Name).ToList();
|
||||
|
||||
var results = new List<SourceResult>();
|
||||
|
||||
foreach (var source in _sources.Where(s => s.Enabled))
|
||||
{
|
||||
int fetched = 0, queued = 0, flagged = 0, spam = 0, dupes = 0;
|
||||
int fetched = 0, queued = 0, published = 0, flagged = 0, spam = 0, dupes = 0;
|
||||
IReadOnlyList<ScrapedItem> items;
|
||||
try { items = await source.FetchAsync(ct); }
|
||||
catch (Exception ex) { _log.LogError(ex, "Source {Source} fetch failed", source.Name); continue; }
|
||||
catch (Exception ex) { _log.LogError(ex, "Source {Source} failed", source.Name); continue; }
|
||||
|
||||
foreach (var item in items)
|
||||
{
|
||||
@@ -66,42 +71,155 @@ public class IngestionService
|
||||
var hash = Hash(item.RawText);
|
||||
if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; }
|
||||
|
||||
var parsed = _parser.Parse(item.RawText, roles, cities, districts);
|
||||
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
|
||||
var val = _validator.Validate(item.RawText, parsed);
|
||||
|
||||
var status = val.IsSpam ? RawListingStatus.Discarded
|
||||
: val.IsValid ? RawListingStatus.New
|
||||
: RawListingStatus.Flagged;
|
||||
if (status == RawListingStatus.New) queued++;
|
||||
else if (status == RawListingStatus.Flagged) flagged++;
|
||||
else spam++;
|
||||
AiAuditResult? ai = null;
|
||||
if (settings.AiEnabled && !val.IsSpam)
|
||||
ai = await _ai.AuditAsync(item.RawText, settings, ct);
|
||||
|
||||
_db.RawListings.Add(new RawListing
|
||||
var (status, reason, confidence) = Decide(settings, val, ai);
|
||||
|
||||
var raw = new RawListing
|
||||
{
|
||||
SourceChannel = item.Source,
|
||||
SourceUrl = item.SourceUrl,
|
||||
RawText = item.RawText.Trim(),
|
||||
ContentHash = hash,
|
||||
Confidence = val.Confidence,
|
||||
ValidationNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null,
|
||||
Confidence = confidence,
|
||||
ValidationNotes = reason,
|
||||
Status = status,
|
||||
});
|
||||
};
|
||||
_db.RawListings.Add(raw);
|
||||
|
||||
if (status == RawListingStatus.Normalized)
|
||||
{
|
||||
try { Publish(parsed, ai, raw, roles, cities, districts); published++; }
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Auto-publish failed; queueing instead"); raw.Status = RawListingStatus.New; queued++; }
|
||||
}
|
||||
else if (status == RawListingStatus.New) queued++;
|
||||
else if (status == RawListingStatus.Flagged) flagged++;
|
||||
else spam++;
|
||||
}
|
||||
|
||||
await _db.SaveChangesAsync(ct);
|
||||
results.Add(new SourceResult(source.Name, fetched, queued, flagged, spam, dupes));
|
||||
_log.LogInformation("Ingestion {Source}: fetched={F} queued={Q} flagged={Fl} spam={S} dupes={D}",
|
||||
source.Name, fetched, queued, flagged, spam, dupes);
|
||||
results.Add(new SourceResult(source.Name, fetched, queued, published, flagged, spam, dupes));
|
||||
_log.LogInformation("Ingest {S}: fetched={F} queued={Q} published={P} flagged={Fl} spam={Sp} dupes={D}",
|
||||
source.Name, fetched, queued, published, flagged, spam, dupes);
|
||||
}
|
||||
|
||||
return new IngestionSummary(results);
|
||||
}
|
||||
|
||||
/// <summary>
/// Chooses the stored status, the reviewer-facing note and the confidence for one listing.
/// Precedence: rule-based spam → AI verdict (when one was obtained) → rule-based validation.
/// </summary>
/// <param name="s">Settings supplying the ingestion mode, the AI auto-approve switch and the auto-publish threshold.</param>
/// <param name="val">Result of the rule-based validation.</param>
/// <param name="ai">The AI verdict, or null when none is available.</param>
/// <returns>
/// The status to store, the note text (validation issues, prefixed with the spam marker or the
/// AI verdict where applicable) and the confidence — the AI's when a verdict exists, otherwise
/// the validator's.
/// </returns>
private static (RawListingStatus status, string? reason, int confidence) Decide(
    AppSetting s, ValidationResult val, AiAuditResult? ai)
{
    var notes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null;

    // Rule-based spam always wins.
    if (val.IsSpam)
        return (RawListingStatus.Discarded, Join("اسپم", notes), val.Confidence);

    if (ai is not null)
    {
        var verdict = $"AI: {ai.Decision} ({ai.Confidence}٪)" + (ai.Reason is null ? "" : $" — {ai.Reason}");
        var aiNote = Join(verdict, notes);

        if (ai.Reject)
            return (RawListingStatus.Discarded, aiNote, ai.Confidence);

        if (ai.Approve)
        {
            // Publishing straight from an approval needs both Automatic mode and the auto-approve switch.
            var autoPublish = s.Mode == IngestionMode.Automatic && s.AiAutoApprove;
            return (autoPublish ? RawListingStatus.Normalized : RawListingStatus.New, aiNote, ai.Confidence);
        }

        // Any other decision (e.g. "review") is flagged for a human.
        return (RawListingStatus.Flagged, aiNote, ai.Confidence);
    }

    // No AI verdict: purely rule-based.
    if (!val.IsValid)
        return (RawListingStatus.Flagged, notes, val.Confidence);

    if (s.Mode == IngestionMode.Automatic && val.Confidence >= s.AutoPublishMinConfidence)
        return (RawListingStatus.Normalized, notes, val.Confidence);

    return (RawListingStatus.New, notes, val.Confidence);
}
|
||||
|
||||
/// <summary>
/// Turns an accepted listing into a JobOpening or a Shift attached to a facility that is looked
/// up by name + city, or created (unverified) when none matches. AI-extracted values, when
/// present, take precedence over the parsed ones for role, city, district, kind, title,
/// facility name, shift type and employment type; pay values always come from the parser.
/// Entities are only added to the context here — nothing is saved by this method.
/// </summary>
/// <param name="parsed">Rule-based parse of the listing.</param>
/// <param name="ai">AI verdict whose structured data, if any, overrides parsed values.</param>
/// <param name="raw">The raw listing; its status is set to Normalized on success.</param>
/// <param name="roles">Known roles; the first one is the fallback when no name matches.</param>
/// <param name="cities">Known cities; the fallback is the first active city, then the first city.</param>
/// <param name="districts">Known districts; a match requires both the name and the resolved city.</param>
private void Publish(ParsedListing parsed, AiAuditResult? ai, RawListing raw,
    List<Role> roles, List<City> cities, List<District> districts)
{
    var extracted = ai?.Data;

    var roleName = extracted?.Role ?? parsed.RoleName;
    var cityName = extracted?.City ?? parsed.CityName;
    var districtName = extracted?.District ?? parsed.DistrictName;

    var role = roles.FirstOrDefault(r => r.Name == roleName) ?? roles.First();
    var city = cities.FirstOrDefault(c => c.Name == cityName)
        ?? cities.FirstOrDefault(c => c.IsActive)
        ?? cities.First();
    var district = districts.FirstOrDefault(x => x.Name == districtName && x.CityId == city.Id);

    // Facility name: the AI-provided one when present, otherwise a placeholder naming the source.
    string facilityName;
    if (string.IsNullOrWhiteSpace(extracted?.FacilityName))
        facilityName = $"مرکز درمانی (از {raw.SourceChannel})";
    else
        facilityName = extracted!.FacilityName!.Trim();

    // Check entities already tracked in this context first, then the database.
    var facility = _db.Facilities.Local.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id);
    facility ??= _db.Facilities.FirstOrDefault(f => f.Name == facilityName && f.CityId == city.Id);
    if (facility is null)
    {
        facility = new Facility
        {
            Name = facilityName,
            Type = FacilityType.Clinic,
            City = city,
            DistrictId = district?.Id,
            Phone = parsed.Phone,
            IsVerified = false,
        };
        _db.Facilities.Add(facility);
    }

    var kind = (extracted?.Kind ?? parsed.Kind.ToString()).ToLowerInvariant();
    var isJob = kind.Contains("job") || kind.Contains("استخدام");

    if (isJob)
    {
        string title;
        if (string.IsNullOrWhiteSpace(extracted?.Title))
            title = $"استخدام {role.Name}";
        else
            title = extracted!.Title!.Trim();

        var opening = new JobOpening
        {
            Facility = facility,
            Role = role,
            Title = title,
            EmploymentType = MapEmployment(extracted?.EmploymentType, parsed.EmploymentType),
            SalaryMin = parsed.PayAmount,
            Description = raw.RawText,
            Status = ShiftStatus.Open,
            Source = ShiftSource.Aggregated,
            SourceUrl = raw.SourceUrl,
        };
        _db.JobOpenings.Add(opening);
    }
    else
    {
        var shiftType = MapShiftType(extracted?.ShiftType, parsed.ShiftType);
        var (start, end) = DefaultTimes(shiftType);

        // Percentage only when a share is given without an amount; no amount at all means negotiable.
        PayType payType;
        if (parsed.SharePercent is not null && parsed.PayAmount is null)
            payType = PayType.Percentage;
        else if (parsed.PayAmount is null)
            payType = PayType.Negotiable;
        else
            payType = PayType.PerShift;

        var shift = new Shift
        {
            Facility = facility,
            Role = role,
            // The shift is dated tomorrow, relative to the current UTC date.
            Date = DateOnly.FromDateTime(DateTime.UtcNow).AddDays(1),
            StartTime = start,
            EndTime = end,
            ShiftType = shiftType,
            SpecialtyRequired = role.Name,
            Description = raw.RawText,
            PayType = payType,
            PayAmount = parsed.PayAmount,
            SharePercent = parsed.SharePercent,
            Status = ShiftStatus.Open,
            Source = ShiftSource.Aggregated,
            SourceUrl = raw.SourceUrl,
        };
        _db.Shifts.Add(shift);
    }

    raw.Status = RawListingStatus.Normalized;
}
|
||||
|
||||
/// <summary>
/// Maps the AI's shift-type keyword (case-insensitive) to a <c>ShiftType</c>. Unknown or missing
/// keywords fall back to the parsed value, and to Day when that is null too.
/// </summary>
private static ShiftType MapShiftType(string? ai, ShiftType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "day":
            return ShiftType.Day;
        case "evening":
            return ShiftType.Evening;
        case "night":
            return ShiftType.Night;
        case "oncall":
            return ShiftType.OnCall;
        default:
            return parsed ?? ShiftType.Day;
    }
}
|
||||
|
||||
/// <summary>
/// Maps the AI's employment-type keyword (case-insensitive) to an <c>EmploymentType</c>. Unknown
/// or missing keywords fall back to the parsed value, and to FullTime when that is null too.
/// </summary>
private static EmploymentType MapEmployment(string? ai, EmploymentType? parsed)
{
    switch (ai?.ToLowerInvariant())
    {
        case "parttime":
            return EmploymentType.PartTime;
        case "contract":
            return EmploymentType.Contract;
        case "plan":
            return EmploymentType.Plan;
        case "fulltime":
            return EmploymentType.FullTime;
        default:
            return parsed ?? EmploymentType.FullTime;
    }
}
|
||||
|
||||
/// <summary>
/// Default start/end times per shift type: Day 08:00–14:00, Evening 14:00–20:00,
/// Night 20:00–08:00, anything else 08:00–08:00.
/// </summary>
private static (TimeOnly, TimeOnly) DefaultTimes(ShiftType t)
{
    var eight = new TimeOnly(8, 0);
    var fourteen = new TimeOnly(14, 0);
    var twenty = new TimeOnly(20, 0);

    switch (t)
    {
        case ShiftType.Day:
            return (eight, fourteen);
        case ShiftType.Evening:
            return (fourteen, twenty);
        case ShiftType.Night:
            return (twenty, eight);
        default:
            return (eight, eight);
    }
}
|
||||
|
||||
/// <summary>Returns <paramref name="a"/> alone when <paramref name="b"/> is null or empty, otherwise "a | b".</summary>
private static string? Join(string a, string? b)
{
    if (string.IsNullOrEmpty(b))
        return a;

    return $"{a} | {b}";
}
|
||||
|
||||
/// <summary>SHA-256 hex of the whitespace-normalized text (for cross-run dedupe).</summary>
/// <param name="text">Text to hash; null is treated as empty.</param>
/// <returns>Lower-case hexadecimal digest (64 characters).</returns>
private static string Hash(string text)
{
    // Trim, then collapse every whitespace run to one space, so formatting-only differences
    // produce the same hash.
    var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
    return Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(normalized))).ToLowerInvariant();
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
using JobsMedical.Web.Data;
|
||||
using JobsMedical.Web.Models;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>Loads/creates the single platform-settings row (Id=1).</summary>
public class SettingsService
{
    private readonly AppDbContext _db;

    public SettingsService(AppDbContext db) => _db = db;

    /// <summary>
    /// Returns the settings row with Id 1, inserting and saving a new <see cref="AppSetting"/>
    /// with that Id first when none exists.
    /// </summary>
    /// <param name="ct">Token used to cancel the database calls.</param>
    public async Task<AppSetting> GetAsync(CancellationToken ct = default)
    {
        var s = await _db.AppSettings.FirstOrDefaultAsync(x => x.Id == 1, ct);
        if (s is null)
        {
            s = new AppSetting { Id = 1 };
            _db.AppSettings.Add(s);
            await _db.SaveChangesAsync(ct);
        }
        return s;
    }

    /// <summary>
    /// Copies the editable fields from <paramref name="incoming"/> onto the stored row and saves.
    /// Text fields are trimmed, the confidence threshold is clamped to 0–100, a blank system
    /// prompt is replaced by <see cref="AppSetting.DefaultPrompt"/>, and UpdatedAt is set to the
    /// current UTC time.
    /// </summary>
    /// <param name="incoming">The values to store.</param>
    /// <param name="ct">Token used to cancel the database calls.</param>
    public async Task SaveAsync(AppSetting incoming, CancellationToken ct = default)
    {
        var s = await GetAsync(ct);

        s.Mode = incoming.Mode;
        s.AutoPublishMinConfidence = Math.Clamp(incoming.AutoPublishMinConfidence, 0, 100);

        s.AiEnabled = incoming.AiEnabled;
        s.AiEndpoint = incoming.AiEndpoint?.Trim();
        s.AiApiKey = incoming.AiApiKey?.Trim();
        s.AiModel = incoming.AiModel?.Trim();
        s.AiSystemPrompt = string.IsNullOrWhiteSpace(incoming.AiSystemPrompt)
            ? AppSetting.DefaultPrompt
            : incoming.AiSystemPrompt;
        s.AiAutoApprove = incoming.AiAutoApprove;

        s.UpdatedAt = DateTime.UtcNow;
        await _db.SaveChangesAsync(ct);
    }
}
|
||||
@@ -1,3 +1,5 @@
|
||||
using System.Net;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace JobsMedical.Web.Services.Scraping;
|
||||
@@ -5,40 +7,71 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
/// <summary>Configuration for the Telegram listing source.</summary>
public class TelegramOptions
{
    /// <summary>Switches the source on; defaults to off.</summary>
    public bool Enabled { get; set; }

    /// <summary>Optional (for private channels later).</summary>
    public string? BotToken { get; set; }

    /// <summary>Public channel usernames (no @).</summary>
    public List<string> Channels { get; set; } = new();

    /// <summary>Maximum number of messages kept per channel.</summary>
    public int PerChannel { get; set; } = 20;
}
|
||||
|
||||
/// <summary>
/// Reads public Telegram channels via the web preview (https://t.me/s/{channel}) — no bot
/// token or login needed for public channels. Each message's text becomes a ScrapedItem.
/// </summary>
public class TelegramListingSource : IListingSource
{
    // Message bodies live in <div class="tgme_widget_message_text ...">...</div>.
    private static readonly Regex MessageText = new(
        "<div class=\"tgme_widget_message_text[^\"]*\"[^>]*>(.*?)</div>", RegexOptions.Singleline);

    private readonly TelegramOptions _opts;
    private readonly IHttpClientFactory _http;
    private readonly ILogger<TelegramListingSource> _log;

    public TelegramListingSource(IOptions<TelegramOptions> opts, IHttpClientFactory http,
        ILogger<TelegramListingSource> log)
    {
        _opts = opts.Value;
        _http = http;
        _log = log;
    }

    /// <summary>Display name of the source.</summary>
    public string Name => "تلگرام";

    /// <summary>True only when the source is switched on and at least one channel is configured.</summary>
    public bool Enabled => _opts.Enabled && _opts.Channels.Count > 0;

    /// <summary>
    /// Downloads the preview page of every configured channel and returns up to
    /// <see cref="TelegramOptions.PerChannel"/> of the most recent messages from each. A failing
    /// channel is logged and skipped; the others still run.
    /// </summary>
    /// <param name="ct">Token used to cancel the HTTP calls.</param>
    /// <exception cref="OperationCanceledException">When <paramref name="ct"/> was cancelled by the caller.</exception>
    public async Task<IReadOnlyList<ScrapedItem>> FetchAsync(CancellationToken ct = default)
    {
        if (!Enabled)
        {
            _log.LogInformation("Telegram source disabled/unconfigured.");
            return Array.Empty<ScrapedItem>();
        }

        var client = _http.CreateClient("scrape");
        var items = new List<ScrapedItem>();

        // Accept "@name", " name " and null entries from configuration without throwing.
        var channels = _opts.Channels
            .Select(c => (c ?? string.Empty).Trim().TrimStart('@'))
            .Where(c => c.Length > 0);

        foreach (var ch in channels)
        {
            try
            {
                var html = await client.GetStringAsync($"https://t.me/s/{ch}", ct);

                // The preview page lists posts oldest-first, so the newest ones are at the end;
                // keep the tail rather than the head when the page holds more than PerChannel.
                foreach (var text in ExtractMessages(html).TakeLast(_opts.PerChannel))
                    items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}"));
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Caller-requested cancellation must stop the whole fetch, not just this channel.
                throw;
            }
            catch (Exception ex)
            {
                _log.LogWarning(ex, "Telegram fetch failed for {Channel}", ch);
            }
        }
        return items;
    }

    /// <summary>
    /// Yields the plain text of each message body found in <paramref name="html"/>, in page
    /// order, skipping texts shorter than 15 characters.
    /// </summary>
    private static IEnumerable<string> ExtractMessages(string html)
    {
        foreach (Match m in MessageText.Matches(html))
        {
            var text = HtmlUtil.ToPlainText(m.Groups[1].Value);
            if (text.Length >= 15) yield return text;
        }
    }
}
|
||||
|
||||
/// <summary>Small HTML-to-text helper for scraped fragments.</summary>
internal static class HtmlUtil
{
    /// <summary>
    /// Converts an HTML fragment to plain text: line-break tags become newlines, all other tags
    /// are removed, HTML entities are decoded, runs of spaces/tabs collapse to a single space,
    /// and the result is trimmed.
    /// </summary>
    /// <param name="html">The HTML fragment.</param>
    /// <returns>The plain-text rendering.</returns>
    public static string ToPlainText(string html)
    {
        // Order matters: <br> must become a newline before the generic tag strip removes it.
        var withBreaks = Regex.Replace(html, "<br\\s*/?>", "\n", RegexOptions.IgnoreCase);
        var withoutTags = Regex.Replace(withBreaks, "<[^>]+>", "");
        var decoded = WebUtility.HtmlDecode(withoutTags);
        var collapsed = Regex.Replace(decoded, "[ \\t]+", " ");
        return collapsed.Trim();
    }
}
|
||||
|
||||
Reference in New Issue
Block a user