Fix: site-wide phone on every Medjobs ad + phone mistaken for price
- HarvestPhones was run over the whole page, so Medjobs' own header/footer number (09101016110) was appended to every ad. It now harvests only the ad's description region in the Medjobs and Website sources; the protected number still comes from the reveal call, so the same number is no longer duplicated across ads.
- The amount extractor read phone digits as a Toman price (۹,۱۰۱,۰۱۶,۱۱۰ تومان). The parser now strips «شماره تماس…» lines and mobile/landline numbers before extracting money, and only accepts 6–10 digit numbers with no leading zero (phones/ids start with 0 or are 11+ digits long).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -151,9 +151,10 @@ public class MedjobsListingSource : IListingSource
|
||||
var text = HtmlUtil.ToPlainText(string.Join("\n", parts));
|
||||
if (text.Length > 1800) text = text[..1800];
|
||||
|
||||
// The contact number is often outside the description (in a tel: link / data attribute the
|
||||
// page reveals on click). Harvest it from the full HTML and append so the parser/AI see it.
|
||||
var phones = HtmlUtil.HarvestPhones(html);
|
||||
// Only harvest a number written inside the ad's own DESCRIPTION — never the full page,
|
||||
// which carries the site's own header/footer number on every ad. The real protected
|
||||
// number comes from RevealPhonesAsync (the admin-ajax reveal).
|
||||
var phones = HtmlUtil.HarvestPhones(body ?? "");
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||
return text;
|
||||
|
||||
@@ -53,8 +53,9 @@ public class WebsiteListingSource : IListingSource
|
||||
var text = HtmlUtil.ToPlainText(string.Join("\n", new[] { title, body }.Where(x => !string.IsNullOrWhiteSpace(x))));
|
||||
if (text.Length > 1800) text = text[..1800];
|
||||
|
||||
// Append any contact number found in the full markup (tel:/data-phone/JSON-LD/inline).
|
||||
var phones = HtmlUtil.HarvestPhones(html);
|
||||
// Harvest a number from the ad's own content region only (not the whole page, which would
|
||||
// pick up the site's header/footer number on every listing).
|
||||
var phones = HtmlUtil.HarvestPhones(body ?? "");
|
||||
if (phones.Count > 0 && !phones.Any(text.Contains))
|
||||
text += "\nشماره تماس: " + string.Join("، ", phones);
|
||||
return text;
|
||||
|
||||
Reference in New Issue
Block a user