Per-ad contacts for shifts/jobs, stale-applicant filter, review source link
CI/CD / CI · dotnet build (push) Successful in 1m3s
CI/CD / Deploy · hamkadr (push) Successful in 1m18s

Phone fix: shifts/jobs showed Facility.Phone, but unnamed ads all share one
placeholder facility, so every such listing displayed the same stale number
while the ad's real phone sat unused in the description. ContactMethod is now
attachable to a Shift/JobOpening (not just talent); ingestion stores the ad's
own number(s) on each listing and the detail pages render them (new
_ContactList partial), falling back to the facility phone only when the ad had
none. Migration ShiftJobContacts (nullable owner FKs) — auto-applies on deploy.

Stale applicants: skip «آماده به کار» posts older than 7 days at ingest, by the
source's real timestamp (Telegram <time>, Bale date) or a Persian time-ago
phrase in the text (Divar «۲ هفته پیش»). Recorded as Discarded; shifts/jobs
are not aged out.

Admin: Review page now shows a «مشاهده آگهی در منبع» link (RawListing.SourceUrl)
so the source post can be checked before publishing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-10 21:28:12 +03:30
parent b71d8b362b
commit 38031cb189
18 changed files with 1943 additions and 47 deletions
@@ -36,17 +36,20 @@ public class BaleListingSource : IListingSource
var items = new List<ScrapedItem>();
foreach (var update in result.EnumerateArray())
{
var text = TextOf(update, "channel_post") ?? TextOf(update, "message");
if (!string.IsNullOrWhiteSpace(text) && text!.Trim().Length >= 15)
items.Add(new ScrapedItem("بله", text.Trim()));
var post = Msg(update, "channel_post") ?? Msg(update, "message");
if (post is not { } p) continue;
var text = p.TryGetProperty("text", out var t) && t.ValueKind == JsonValueKind.String ? t.GetString() : null;
if (string.IsNullOrWhiteSpace(text) || text!.Trim().Length < 15) continue;
// Bot API messages carry a unix `date` — keep it so stale posts can be aged out.
DateTime? postedAt = p.TryGetProperty("date", out var d) && d.ValueKind == JsonValueKind.Number && d.TryGetInt64(out var epoch)
? DateTimeOffset.FromUnixTimeSeconds(epoch).UtcDateTime : null;
items.Add(new ScrapedItem("بله", text.Trim(), PostedAt: postedAt));
}
return items;
}
catch (Exception ex) { _log.LogWarning(ex, "Bale fetch failed."); return Array.Empty<ScrapedItem>(); }
}
/// <summary>Reads the string <c>text</c> property of the object stored under <paramref name="key"/>
/// in a Bot API update.</summary>
/// <param name="update">One element of the updates array.</param>
/// <param name="key">Name of the message-carrying property (e.g. "channel_post", "message").</param>
/// <returns>The text, or null when the key is absent, has no <c>text</c> property, or that
/// property is not a JSON string.</returns>
private static string? TextOf(JsonElement update, string key)
{
    if (!update.TryGetProperty(key, out var message)) return null;
    if (!message.TryGetProperty("text", out var body)) return null;
    return body.ValueKind == JsonValueKind.String ? body.GetString() : null;
}
/// <summary>Returns the JSON object stored under <paramref name="key"/> in a Bot API update.</summary>
/// <param name="update">One element of the updates array.</param>
/// <param name="key">Name of the message-carrying property (e.g. "channel_post", "message").</param>
/// <returns>The nested object, or null when <paramref name="update"/> is not a JSON object, the key
/// is absent, or its value is not a JSON object.</returns>
private static JsonElement? Msg(JsonElement update, string key)
{
    // JsonElement.TryGetProperty throws InvalidOperationException on anything that is not an
    // object. Without this guard a single malformed array element would throw out of the fetch
    // loop and the whole batch would be lost; returning null lets the caller skip just that one.
    if (update.ValueKind != JsonValueKind.Object) return null;
    return update.TryGetProperty(key, out var m) && m.ValueKind == JsonValueKind.Object ? m : null;
}
}
@@ -4,9 +4,11 @@ namespace JobsMedical.Web.Services.Scraping;
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).
/// Lat/Lng are an APPROXIMATE location when the source exposes one (e.g. Divar's privacy-fuzzed
/// map center) — used to place an aggregated facility on the map / enable «near me».</summary>
/// map center) — used to place an aggregated facility on the map / enable «near me».
/// PostedAt is the post's ORIGINAL publish time when the source exposes it (Telegram &lt;time&gt;,
/// Bale message date…) — used to drop stale applicant ads at ingest. Null when unknown.</summary>
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null,
double? Lat = null, double? Lng = null);
double? Lat = null, double? Lng = null, DateTime? PostedAt = null);
/// <summary>
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
@@ -29,6 +29,10 @@ public record IngestionSummary(List<SourceResult> Sources)
/// </summary>
public class IngestionService
{
/// <summary>Applicant posts older than this (by the source's date, or a Persian "time ago"
/// phrase in the text) are skipped at ingest — availability goes stale fast.</summary>
private const int TalentMaxAgeDays = 7;
private readonly AppDbContext _db;
private readonly IEnumerable<IListingSource> _sources;
private readonly IListingParser _parser;
@@ -90,6 +94,22 @@ public class IngestionService
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
var val = _validator.Validate(item.RawText, parsed);
// Drop STALE applicant («آماده به کار») posts — a person's availability goes cold fast.
// Age = the source's real timestamp, else a Persian "time ago" phrase in the text
// (Divar embeds «۲ هفته پیش»…). Recorded as Discarded (keeps the dedupe hash + audit
// trail; no AI spend). Shifts/jobs are NOT aged out — their dates are in the future.
if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
{
_db.RawListings.Add(new RawListing
{
SourceChannel = item.Source, SourceUrl = item.SourceUrl, RawText = item.RawText.Trim(),
ContentHash = hash, Confidence = 0, Status = RawListingStatus.Discarded,
ValidationNotes = $"آماده‌به‌کارِ قدیمی ({age} روز) — نادیده گرفته شد",
Lat = item.Lat, Lng = item.Lng,
});
spam++; continue;
}
AiAuditResult? ai = null;
if (settings.AiEnabled && !val.IsSpam)
ai = await _ai.AuditAsync(item.RawText, settings, ct);
@@ -280,6 +300,7 @@ public class IngestionService
SalaryMin = parsed.PayAmount,
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
SourceUrl = raw.SourceUrl,
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
});
}
else
@@ -297,6 +318,7 @@ public class IngestionService
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
});
}
raw.Status = RawListingStatus.Normalized;
@@ -449,4 +471,14 @@ public class IngestionService
var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
return Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(normalized))).ToLowerInvariant();
}
/// <summary>How many whole days old a scraped post is.</summary>
/// <param name="item">The scraped post.</param>
/// <returns>Days elapsed since <c>item.PostedAt</c> (floored, never negative) when that timestamp
/// is set; otherwise whatever <c>HtmlUtil.AgeDaysFromPersianText</c> derives from the raw text,
/// which is null when the text carries no age phrase.</returns>
private static int? PostAgeDays(ScrapedItem item)
{
    // No timestamp from the source: fall back to a "time ago" phrase in the text.
    if (item.PostedAt is not { } publishedAt)
        return HtmlUtil.AgeDaysFromPersianText(item.RawText);

    var elapsed = DateTime.UtcNow - publishedAt;
    var wholeDays = (int)Math.Floor(elapsed.TotalDays);
    // A timestamp slightly in the future counts as "today".
    return wholeDays < 0 ? 0 : wholeDays;
}
}
@@ -33,21 +33,28 @@ public class TelegramListingSource : IListingSource
try
{
var html = await client.GetStringAsync($"https://t.me/s/{ch}", ct);
foreach (var text in ExtractMessages(html).Take(20))
items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}"));
foreach (var (text, postedAt) in ExtractMessages(html).Take(20))
items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}", PostedAt: postedAt));
}
catch (Exception ex) { _log.LogWarning(ex, "Telegram fetch failed for {Channel}", ch); }
}
return items;
}
private static IEnumerable<string> ExtractMessages(string html)
private static IEnumerable<(string text, DateTime? postedAt)> ExtractMessages(string html)
{
foreach (Match m in Regex.Matches(html,
"<div class=\"tgme_widget_message_text[^\"]*\"[^>]*>(.*?)</div>", RegexOptions.Singleline))
{
var text = HtmlUtil.ToPlainText(m.Groups[1].Value);
if (text.Length >= 15) yield return text;
if (text.Length < 15) continue;
// The message's date link (<time datetime="…">) follows its text in the same bubble —
// grab the nearest one after this match.
DateTime? postedAt = null;
var tail = html.Substring(m.Index + m.Length, Math.Min(2000, html.Length - (m.Index + m.Length)));
var dm = Regex.Match(tail, "datetime=\"([^\"]+)\"");
if (dm.Success && DateTimeOffset.TryParse(dm.Groups[1].Value, out var dto)) postedAt = dto.UtcDateTime;
yield return (text, postedAt);
}
}
}
@@ -63,6 +70,30 @@ internal static class HtmlUtil
return s.Trim();
}
/// <summary>Best-effort age (in days) of a post, read from a Persian "time ago" phrase in its text
/// («دیروز» yesterday, «۳ روز پیش» 3 days ago, «هفته پیش» a week ago, «ماه پیش» a month ago…).
/// Lets a post be age-filtered when no real timestamp is available.</summary>
/// <param name="text">Raw post text; may be null or empty.</param>
/// <returns>0 for a moments/minutes/hours-ago phrase; 2 for «پریروز»; 1 for «دیروز»; n, n×7, n×30 or
/// n×365 for a numbered day/week/month/year phrase (clamped to <see cref="int.MaxValue"/>); 7, 30 or
/// 365 for a bare week/month/year phrase; null when no phrase is found or the number in a numbered
/// phrase cannot be read as an int (caller then treats the age as unknown).</returns>
/// <remarks>The checks run in the order above and match anywhere in the text, so a phrase that
/// refers to something other than the post's own age is counted as well.</remarks>
public static int? AgeDaysFromPersianText(string? text)
{
    if (string.IsNullOrEmpty(text)) return null;
    var t = ToLatinDigits(text);
    if (Regex.IsMatch(t, "لحظات|هم[‌ ]?اکنون|چند لحظه|دقیقه پیش|دقایقی پیش|ساعت پیش|ساعتی پیش")) return 0;
    if (t.Contains("پریروز")) return 2;
    if (t.Contains("دیروز")) return 1;
    var m = Regex.Match(t, @"(\d+)\s*(روز|هفته|ماه|سال)\s*پیش");
    if (m.Success)
    {
        // \d+ can capture a digit run too long for an int, or Unicode digits that were not
        // converted to Latin; int.Parse would throw on both. This helper is best-effort, so an
        // unreadable number means "unknown age" rather than an exception.
        if (!int.TryParse(m.Groups[1].Value, System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture, out var n))
            return null;
        int? daysPerUnit = m.Groups[2].Value switch
        { "روز" => 1, "هفته" => 7, "ماه" => 30, "سال" => 365, _ => (int?)null };
        if (daysPerUnit is not int perUnit) return null;
        // Multiply in 64-bit and clamp so a large count cannot wrap around to a small or negative age.
        return (int)Math.Min(int.MaxValue, (long)n * perUnit);
    }
    if (Regex.IsMatch(t, @"هفته\s*پیش")) return 7; // bare "week ago" = 1 week
    if (Regex.IsMatch(t, @"ماه\s*پیش")) return 30;
    if (Regex.IsMatch(t, @"سال\s*پیش") || t.Contains("پارسال")) return 365;
    return null;
}
/// <summary>Convert Persian/Arabic-Indic digits to Latin.</summary>
public static string ToLatinDigits(string s)
{