Per-ad contacts for shifts/jobs, stale-applicant filter, review source link
Phone fix: shifts/jobs showed Facility.Phone, but unnamed ads all share one placeholder facility, so every such listing displayed the same stale number while the ad's real phone sat unused in the description. ContactMethod is now attachable to a Shift/JobOpening (not just talent); ingestion stores the ad's own number(s) on each listing and the detail pages render them (new _ContactList partial), falling back to the facility phone only when the ad had none. Migration ShiftJobContacts (nullable owner FKs) — auto-applies on deploy. Stale applicants: skip «آماده به کار» posts older than 7 days at ingest, by the source's real timestamp (Telegram <time>, Bale date) or a Persian time-ago phrase in the text (Divar «۲ هفته پیش»). Recorded as Discarded; shifts/jobs are not aged out. Admin: Review page now shows a «مشاهده آگهی در منبع» link (RawListing.SourceUrl) so the source post can be checked before publishing. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -36,17 +36,20 @@ public class BaleListingSource : IListingSource
|
||||
var items = new List<ScrapedItem>();
|
||||
foreach (var update in result.EnumerateArray())
|
||||
{
|
||||
var text = TextOf(update, "channel_post") ?? TextOf(update, "message");
|
||||
if (!string.IsNullOrWhiteSpace(text) && text!.Trim().Length >= 15)
|
||||
items.Add(new ScrapedItem("بله", text.Trim()));
|
||||
var post = Msg(update, "channel_post") ?? Msg(update, "message");
|
||||
if (post is not { } p) continue;
|
||||
var text = p.TryGetProperty("text", out var t) && t.ValueKind == JsonValueKind.String ? t.GetString() : null;
|
||||
if (string.IsNullOrWhiteSpace(text) || text!.Trim().Length < 15) continue;
|
||||
// Bot API messages carry a unix `date` — keep it so stale posts can be aged out.
|
||||
DateTime? postedAt = p.TryGetProperty("date", out var d) && d.ValueKind == JsonValueKind.Number && d.TryGetInt64(out var epoch)
|
||||
? DateTimeOffset.FromUnixTimeSeconds(epoch).UtcDateTime : null;
|
||||
items.Add(new ScrapedItem("بله", text.Trim(), PostedAt: postedAt));
|
||||
}
|
||||
return items;
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Bale fetch failed."); return Array.Empty<ScrapedItem>(); }
|
||||
}
|
||||
|
||||
/// <summary>Reads the string <c>text</c> field of the sub-object stored under
/// <paramref name="key"/> in an update.</summary>
/// <param name="update">One element of the updates array.</param>
/// <param name="key">Property name of the message object (e.g. "channel_post" or "message").</param>
/// <returns>The text, or null when the key is absent, has no <c>text</c> property, or that
/// property is not a JSON string.</returns>
private static string? TextOf(JsonElement update, string key)
{
    if (!update.TryGetProperty(key, out var message)) return null;
    if (!message.TryGetProperty("text", out var body)) return null;
    return body.ValueKind == JsonValueKind.String ? body.GetString() : null;
}
|
||||
/// <summary>Returns the property <paramref name="key"/> of an update when it is present and is a
/// JSON object; otherwise null.</summary>
/// <param name="update">One element of the updates array.</param>
/// <param name="key">Property name to look up (e.g. "channel_post" or "message").</param>
private static JsonElement? Msg(JsonElement update, string key)
{
    if (update.TryGetProperty(key, out var candidate) && candidate.ValueKind == JsonValueKind.Object)
        return candidate;
    return null;
}
|
||||
}
|
||||
|
||||
@@ -4,9 +4,11 @@ namespace JobsMedical.Web.Services.Scraping;
|
||||
|
||||
/// <summary>One raw post pulled from a source (a Telegram message, a Divar ad, etc.).
|
||||
/// Lat/Lng are an APPROXIMATE location when the source exposes one (e.g. Divar's privacy-fuzzed
|
||||
/// map center) — used to place an aggregated facility on the map / enable «near me».</summary>
|
||||
/// map center) — used to place an aggregated facility on the map / enable «near me».
|
||||
/// PostedAt is the post's ORIGINAL publish time when the source exposes it (Telegram &lt;time&gt;,
|
||||
/// Bale message date…) — used to drop stale applicant ads at ingest. Null when unknown.</summary>
|
||||
public record ScrapedItem(string Source, string RawText, string? SourceUrl = null,
|
||||
double? Lat = null, double? Lng = null);
|
||||
double? Lat = null, double? Lng = null, DateTime? PostedAt = null);
|
||||
|
||||
/// <summary>
|
||||
/// A pluggable source the ingestion engine pulls from. Configuration (enabled, channels, tokens)
|
||||
|
||||
@@ -29,6 +29,10 @@ public record IngestionSummary(List<SourceResult> Sources)
|
||||
/// </summary>
|
||||
public class IngestionService
|
||||
{
|
||||
/// <summary>Applicant posts older than this (by the source's date, or a Persian "time ago"
|
||||
/// phrase in the text) are skipped at ingest — availability goes stale fast.</summary>
|
||||
private const int TalentMaxAgeDays = 7;
|
||||
|
||||
private readonly AppDbContext _db;
|
||||
private readonly IEnumerable<IListingSource> _sources;
|
||||
private readonly IListingParser _parser;
|
||||
@@ -90,6 +94,22 @@ public class IngestionService
|
||||
var parsed = _parser.Parse(item.RawText, roleNames, cityNames, districtNames);
|
||||
var val = _validator.Validate(item.RawText, parsed);
|
||||
|
||||
// Drop STALE applicant («آماده به کار») posts — a person's availability goes cold fast.
|
||||
// Age = the source's real timestamp, else a Persian "time ago" phrase in the text
|
||||
// (Divar embeds «۲ هفته پیش»…). Recorded as Discarded (keeps the dedupe hash + audit
|
||||
// trail; no AI spend). Shifts/jobs are NOT aged out — their dates are in the future.
|
||||
if (parsed.Kind == ListingKind.Talent && PostAgeDays(item) is int age && age > TalentMaxAgeDays)
|
||||
{
|
||||
_db.RawListings.Add(new RawListing
|
||||
{
|
||||
SourceChannel = item.Source, SourceUrl = item.SourceUrl, RawText = item.RawText.Trim(),
|
||||
ContentHash = hash, Confidence = 0, Status = RawListingStatus.Discarded,
|
||||
ValidationNotes = $"آمادهبهکارِ قدیمی ({age} روز) — نادیده گرفته شد",
|
||||
Lat = item.Lat, Lng = item.Lng,
|
||||
});
|
||||
spam++; continue;
|
||||
}
|
||||
|
||||
AiAuditResult? ai = null;
|
||||
if (settings.AiEnabled && !val.IsSpam)
|
||||
ai = await _ai.AuditAsync(item.RawText, settings, ct);
|
||||
@@ -280,6 +300,7 @@ public class IngestionService
|
||||
SalaryMin = parsed.PayAmount,
|
||||
Description = raw.RawText, Status = ShiftStatus.Open, Source = ShiftSource.Aggregated,
|
||||
SourceUrl = raw.SourceUrl,
|
||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
|
||||
});
|
||||
}
|
||||
else
|
||||
@@ -297,6 +318,7 @@ public class IngestionService
|
||||
: parsed.PayAmount is null ? PayType.Negotiable : PayType.PerShift,
|
||||
PayAmount = parsed.PayAmount, SharePercent = parsed.SharePercent,
|
||||
Status = ShiftStatus.Open, Source = ShiftSource.Aggregated, SourceUrl = raw.SourceUrl,
|
||||
Contacts = BuildContacts(d, parsed), // the ad's OWN number(s) — fresh per listing
|
||||
});
|
||||
}
|
||||
raw.Status = RawListingStatus.Normalized;
|
||||
@@ -449,4 +471,14 @@ public class IngestionService
|
||||
var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " ");
|
||||
return Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(normalized))).ToLowerInvariant();
|
||||
}
|
||||
|
||||
/// <summary>Whole-day age of a scraped post.</summary>
/// <param name="item">The scraped post to age.</param>
/// <returns>When <c>PostedAt</c> is set: the floored number of days between it and
/// <see cref="DateTime.UtcNow"/>, never negative. Otherwise whatever
/// <c>HtmlUtil.AgeDaysFromPersianText</c> derives from the raw text, which is null when the age
/// is unknown — the caller then does not filter the post.</returns>
private static int? PostAgeDays(ScrapedItem item)
{
    // No real timestamp: fall back to a "time ago" phrase embedded in the text.
    if (item.PostedAt is not DateTime posted)
        return HtmlUtil.AgeDaysFromPersianText(item.RawText);

    var elapsed = DateTime.UtcNow - posted;
    var wholeDays = (int)Math.Floor(elapsed.TotalDays);
    // A timestamp in the future would give a negative age; clamp it to zero.
    return Math.Max(0, wholeDays);
}
|
||||
}
|
||||
|
||||
@@ -33,21 +33,28 @@ public class TelegramListingSource : IListingSource
|
||||
try
|
||||
{
|
||||
var html = await client.GetStringAsync($"https://t.me/s/{ch}", ct);
|
||||
foreach (var text in ExtractMessages(html).Take(20))
|
||||
items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}"));
|
||||
foreach (var (text, postedAt) in ExtractMessages(html).Take(20))
|
||||
items.Add(new ScrapedItem($"تلگرام/{ch}", text, $"https://t.me/{ch}", PostedAt: postedAt));
|
||||
}
|
||||
catch (Exception ex) { _log.LogWarning(ex, "Telegram fetch failed for {Channel}", ch); }
|
||||
}
|
||||
return items;
|
||||
}
|
||||
|
||||
/// <summary>Extracts message texts, with their publish time when one can be found, from the
/// HTML of a <c>t.me/s/…</c> channel page.</summary>
/// <param name="html">Raw HTML of the channel page.</param>
/// <returns>One tuple per <c>tgme_widget_message_text</c> block whose plain text is at least 15
/// characters long. <c>postedAt</c> is the UTC value of the first <c>datetime="…"</c> attribute
/// found within the 2000 characters after the text block, or null when there is none or it does
/// not parse.</returns>
private static IEnumerable<(string text, DateTime? postedAt)> ExtractMessages(string html)
{
    foreach (Match m in Regex.Matches(html,
        "<div class=\"tgme_widget_message_text[^\"]*\"[^>]*>(.*?)</div>", RegexOptions.Singleline))
    {
        var text = HtmlUtil.ToPlainText(m.Groups[1].Value);
        if (text.Length < 15) continue;

        // The message's date link (<time datetime="…">) is expected to follow its text in the
        // same bubble, so take the nearest one after this match (bounded look-ahead).
        DateTime? postedAt = null;
        var tailStart = m.Index + m.Length;
        var tail = html.Substring(tailStart, Math.Min(2000, html.Length - tailStart));
        var dm = Regex.Match(tail, "datetime=\"([^\"]+)\"");

        // Machine-format timestamp: parse with the invariant culture so the result does not
        // depend on the host's current culture or calendar.
        if (dm.Success && DateTimeOffset.TryParse(dm.Groups[1].Value,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None, out var dto))
            postedAt = dto.UtcDateTime;

        yield return (text, postedAt);
    }
}
|
||||
}
|
||||
@@ -63,6 +70,30 @@ internal static class HtmlUtil
|
||||
return s.Trim();
|
||||
}
|
||||
|
||||
/// <summary>Best-effort age (in days) of a post from a Persian "time ago" phrase in its text
/// («دیروز»، «۳ روز پیش»، «هفته پیش»، «۲ هفته پیش»، «ماه پیش»…). Divar embeds this in the row
/// text, so we can age-filter it without a real timestamp.</summary>
/// <param name="text">Raw post text; may be null or empty.</param>
/// <returns>0 for now/minutes/hours phrases; 2 for «پریروز»; 1 for «دیروز»; N, N×7, N×30 or
/// N×365 for «N روز/هفته/ماه/سال پیش» (clamped to <see cref="int.MaxValue"/>); 7, 30 or 365 for
/// the bare week/month/year phrases; null when no such phrase is present or the number is too
/// large to be a plausible age (caller then treats age as unknown).</returns>
public static int? AgeDaysFromPersianText(string? text)
{
    if (string.IsNullOrEmpty(text)) return null;
    var t = ToLatinDigits(text);
    if (Regex.IsMatch(t, "لحظات|هم[ ]?اکنون|چند لحظه|دقیقه پیش|دقایقی پیش|ساعت پیش|ساعتی پیش")) return 0;
    if (t.Contains("پریروز")) return 2;
    if (t.Contains("دیروز")) return 1;

    // [0-9] rather than \d: .NET's \d also matches other Unicode digits, which int parsing rejects.
    var m = Regex.Match(t, @"([0-9]+)\s*(روز|هفته|ماه|سال)\s*پیش");
    if (m.Success)
    {
        // A digit run can exceed Int32 (e.g. a phone number glued to the phrase); treat that as
        // "unknown age" instead of letting an exception escape.
        if (!int.TryParse(m.Groups[1].Value, System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture, out var n))
            return null;

        long daysPerUnit = m.Groups[2].Value switch
        { "روز" => 1, "هفته" => 7, "ماه" => 30, "سال" => 365, _ => 0 };
        if (daysPerUnit == 0) return null;

        // Multiply in 64-bit and clamp so a large count cannot wrap to a negative age.
        return (int)Math.Min(n * daysPerUnit, int.MaxValue);
    }
    if (Regex.IsMatch(t, @"هفته\s*پیش")) return 7; // bare «هفته پیش» = 1 week
    if (Regex.IsMatch(t, @"ماه\s*پیش")) return 30;
    if (Regex.IsMatch(t, @"سال\s*پیش") || t.Contains("پارسال")) return 365;
    return null;
}
|
||||
|
||||
/// <summary>Convert Persian/Arabic-Indic digits to Latin.</summary>
|
||||
public static string ToLatinDigits(string s)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user