Locs(string xml)
=> Regex.Matches(xml, "([^<]+)").Select(m => m.Groups[1].Value.Trim());
/// <summary>
/// Builds the text form of an ad page from its title and body.
/// </summary>
/// <remarks>
/// The title comes from the <c>og:title</c> meta tag; when a '|' occurs past index 10,
/// everything from that bar onward (the site suffix) is dropped and the rest is trimmed.
/// The body is the first available of: the <c>rtcl-description</c> div, the
/// <c>entry-content</c> div, or the <c>og:description</c> meta tag.
/// Blank parts are skipped, the remainder is joined with a newline, passed through
/// <c>HtmlUtil.ToPlainText</c>, and capped at 1800 characters.
/// </remarks>
/// <param name="html">Raw HTML of the ad page.</param>
/// <returns>The converted text, at most 1800 characters long.</returns>
private static string ExtractAd(string html)
{
    // A bar at or before this index is treated as part of the title itself.
    const int MinBarIndex = 10;
    const int MaxLength = 1800;

    string? heading = Meta(html, "og:title");
    int barIndex = heading?.IndexOf('|') ?? -1;
    if (barIndex > MinBarIndex)
    {
        heading = heading![..barIndex].Trim();
    }

    // Each fallback is only evaluated when the previous source produced nothing.
    string? content = BetweenClass(html, "rtcl-description");
    content ??= BetweenClass(html, "entry-content");
    content ??= Meta(html, "og:description");

    var nonBlank = new[] { heading, content }.Where(piece => !string.IsNullOrWhiteSpace(piece));
    string plain = HtmlUtil.ToPlainText(string.Join("\n", nonBlank));

    if (plain.Length <= MaxLength)
    {
        return plain;
    }

    return plain[..MaxLength];
}
/// <summary>
/// Reads the <c>content</c> attribute of the first <c>&lt;meta&gt;</c> tag whose
/// <c>property</c> attribute equals <paramref name="prop"/>.
/// </summary>
/// <remarks>
/// Best-effort regex scan: the <c>property</c> attribute must appear before
/// <c>content</c> inside the same tag.
/// </remarks>
/// <param name="html">Raw HTML to search.</param>
/// <param name="prop">Property name such as <c>og:title</c>; it is regex-escaped before use.</param>
/// <returns>The HTML-decoded attribute value, or <c>null</c> when no matching tag is found.</returns>
private static string? Meta(string html, string prop)
{
    // Anchor on the opening "<meta" so the match cannot start in unrelated markup.
    // The value is matched against its own quote character, so an apostrophe inside
    // a double-quoted value (or a double quote inside a single-quoted one) no longer
    // cuts the capture short.
    var pattern =
        $"<meta[^>]+property=[\"']{Regex.Escape(prop)}[\"'][^>]+content=(?:\"([^\"]*)\"|'([^']*)')";

    var m = Regex.Match(html, pattern);
    if (!m.Success)
    {
        return null;
    }

    // Exactly one of the two alternatives captured the value.
    var raw = m.Groups[1].Success ? m.Groups[1].Value : m.Groups[2].Value;
    return System.Net.WebUtility.HtmlDecode(raw);
}
/// <summary>
/// Returns the inner HTML of the first <c>&lt;div&gt;</c> whose <c>class</c> attribute
/// contains <paramref name="cls"/> (best-effort).
/// </summary>
/// <remarks>
/// The capture is lazy and ends at the first <c>&lt;/div&gt;</c>, so content holding
/// nested divs is cut at the first inner closing tag.
/// </remarks>
/// <param name="html">Raw HTML to search.</param>
/// <param name="cls">Class name fragment to look for; it is regex-escaped before use.</param>
/// <returns>The captured inner HTML, or <c>null</c> when no matching div is found.</returns>
private static string? BetweenClass(string html, string cls)
{
    // The pattern is kept on one line: a regular string literal cannot span lines.
    // It is delimited by the opening "<div ... class=..." tag and the first "</div>".
    var pattern =
        $"<div[^>]+class=[\"'][^\"']*{Regex.Escape(cls)}[^\"']*[\"'][^>]*>(.*?)</div>";

    // Singleline lets '.' cross line breaks, since the div body usually spans several lines.
    var m = Regex.Match(html, pattern, RegexOptions.Singleline);
    return m.Success ? m.Groups[1].Value : null;
}
}