From 931b7b6ffbd9848c8109d8328cb9f15c90760147 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Wed, 3 Jun 2026 08:18:19 +0330 Subject: [PATCH] Add scrape/ingestion engine + validation, and 24h shift hour-range visualization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scrape engine (Services/Scraping/): pluggable IListingSource (working sample + Telegram/Divar credential-ready stubs) → IngestionService (content-hash dedupe → parse → validate → review queue) → ListingValidator (completeness score + spam screen) → IngestionWorker (config-gated hosted service). RawListing gains ContentHash/Confidence/ValidationNotes; RawListingStatus.Flagged. Admin /Admin gets run-now, source list, confidence + flagged queue. Hour-range viz: _HourBar 24h timeline bar (colored by type, overnight wrap) on shift cards, recommendation cards, and detail. Co-Authored-By: Claude Opus 4.8 --- README.md | 20 +- src/JobsMedical.Web/Data/AppDbContext.cs | 4 + ...20260603044159_IngestionFields.Designer.cs | 788 ++++++++++++++++++ .../20260603044159_IngestionFields.cs | 69 ++ .../Migrations/AppDbContextModelSnapshot.cs | 15 + src/JobsMedical.Web/Models/Enums.cs | 7 +- src/JobsMedical.Web/Models/RawListing.cs | 11 + src/JobsMedical.Web/Pages/Admin/Index.cshtml | 62 +- .../Pages/Admin/Index.cshtml.cs | 27 +- .../Pages/Shared/_HourBar.cshtml | 42 + .../Pages/Shared/_RawListingRow.cshtml | 20 + .../Pages/Shared/_RecommendationCard.cshtml | 1 + .../Pages/Shared/_ShiftCard.cshtml | 1 + .../Pages/Shifts/Details.cshtml | 4 + src/JobsMedical.Web/Program.cs | 17 + .../Services/Scraping/DivarListingSource.cs | 42 + .../Services/Scraping/IListingSource.cs | 15 + .../Services/Scraping/IngestionService.cs | 107 +++ .../Services/Scraping/IngestionWorker.cs | 59 ++ .../Services/Scraping/ListingValidator.cs | 63 ++ .../Services/Scraping/SampleListingSource.cs | 27 + .../Scraping/TelegramListingSource.cs | 44 + src/JobsMedical.Web/appsettings.json | 6 + src/JobsMedical.Web/wwwroot/css/site.css | 14 + 24 files changed, 1439 insertions(+), 26 deletions(-) create mode 100644 src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.Designer.cs create mode 100644 src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.cs create mode 100644 src/JobsMedical.Web/Pages/Shared/_HourBar.cshtml create mode 100644 src/JobsMedical.Web/Pages/Shared/_RawListingRow.cshtml create mode 100644 src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs create mode 100644 src/JobsMedical.Web/Services/Scraping/IListingSource.cs create mode 100644 src/JobsMedical.Web/Services/Scraping/IngestionService.cs create mode 100644 src/JobsMedical.Web/Services/Scraping/IngestionWorker.cs create mode 100644 src/JobsMedical.Web/Services/Scraping/ListingValidator.cs create mode 100644 src/JobsMedical.Web/Services/Scraping/SampleListingSource.cs create mode 100644 src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs diff --git a/README.md b/README.md index 03f075b..265e48f 100644 --- a/README.md +++ b/README.md @@ -75,11 +75,21 @@ Shifts support fixed (مقطوع), hourly (ساعتی), **profit-share (درصد centralizes the display; `Shift.SharePercent` holds the percentage; the listing-parser detects "۵۰٪ / درصد / سهم" from raw posts; and `/Shifts` has a "سهم درآمد" filter. -### Listing parser (Stage 1) -`IListingParser` / `HeuristicListingParser` extracts kind (shift vs hire), role, shift type, -employment type, pay, city/district, and phone from a raw Persian post via keyword + regex -heuristics — **no AI dependency** (LLM APIs are blocked from Iran). Admin reviews the prefilled -form and publishes. Swap in an `LlmListingParser` later behind the same interface. +### Scrape / ingestion engine +Pluggable `IListingSource`s (working `SampleListingSource`; credential-ready `Telegram`/`Divar` +stubs) → `IngestionService` **dedupes by content hash → parses → validates → enqueues** as +`RawListing` (status New / Flagged / Discarded-spam) with a confidence score. `ListingValidator` +scores completeness (role, location, pay, phone, length) and screens spam. `IngestionWorker` +(hosted, config-gated `Ingestion:Enabled`) runs it on a timer; admins can also run it on demand +from `/Admin`. `IListingParser` / `HeuristicListingParser` does the field extraction (kind, role, +shift type, employment, pay, **profit-share %**, city/district, phone) — **no AI dependency** (LLM +APIs are blocked from Iran). Admin reviews the prefilled form and publishes. Swap an +`LlmListingParser`/real sources behind the same interfaces later. + +### Hour-range visualization +Every shift card, recommendation card, and detail page shows a **24-hour timeline bar** +(`_HourBar`) with the shift's hours filled and colored by type; overnight shifts wrap past +midnight into two segments. ### Auth Phone OTP via `OtpService` (in-memory codes; dev shows the code on screen — wire Kavenegar/SMS.ir diff --git a/src/JobsMedical.Web/Data/AppDbContext.cs b/src/JobsMedical.Web/Data/AppDbContext.cs index 288a8ce..4550f75 100644 --- a/src/JobsMedical.Web/Data/AppDbContext.cs +++ b/src/JobsMedical.Web/Data/AppDbContext.cs @@ -108,5 +108,9 @@ public class AppDbContext : DbContext .HasForeignKey(j => j.FacilityId).OnDelete(DeleteBehavior.Cascade); b.Entity().HasIndex(j => j.Status); b.Entity().HasIndex(j => j.FacilityId); + + // Dedupe ingested listings by content hash. + b.Entity().HasIndex(r => r.ContentHash); + b.Entity().HasIndex(r => r.Status); } } diff --git a/src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.Designer.cs b/src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.Designer.cs new file mode 100644 index 0000000..781d95d --- /dev/null +++ b/src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.Designer.cs @@ -0,0 +1,788 @@ +// +using System; +using JobsMedical.Web.Data; +using Microsoft.EntityFrameworkCore; +using Microsoft.EntityFrameworkCore.Infrastructure; +using Microsoft.EntityFrameworkCore.Migrations; +using Microsoft.EntityFrameworkCore.Storage.ValueConversion; +using Npgsql.EntityFrameworkCore.PostgreSQL.Metadata; + +#nullable disable + +namespace JobsMedical.Web.Migrations +{ + [DbContext(typeof(AppDbContext))] + [Migration("20260603044159_IngestionFields")] + partial class IngestionFields + { + /// + protected override void BuildTargetModel(ModelBuilder modelBuilder) + { +#pragma warning disable 612, 618 + modelBuilder + .HasAnnotation("ProductVersion", "10.0.0") + .HasAnnotation("Relational:MaxIdentifierLength", 63); + + NpgsqlModelBuilderExtensions.UseIdentityByDefaultColumns(modelBuilder); + + modelBuilder.Entity("JobsMedical.Web.Models.Application", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("DoctorId") + .HasColumnType("integer"); + + b.Property("Message") + .HasMaxLength(500) + .HasColumnType("character varying(500)"); + + b.Property("ShiftId") + .HasColumnType("integer"); + + b.Property("Status") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.HasIndex("DoctorId"); + + b.HasIndex("ShiftId", "DoctorId") + .IsUnique(); + + b.ToTable("Applications"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.City", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("IsActive") + .HasColumnType("boolean"); + + b.Property("Name") + .IsRequired() + .HasMaxLength(100) + .HasColumnType("character varying(100)"); + + b.Property("Province") + .IsRequired() + .HasMaxLength(100) + .HasColumnType("character varying(100)"); + + b.HasKey("Id"); + + b.ToTable("Cities"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.District", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CityId") + .HasColumnType("integer"); + + b.Property("IsActive") + .HasColumnType("boolean"); + + b.Property("Name") + .IsRequired() + .HasMaxLength(120) + .HasColumnType("character varying(120)"); + + b.HasKey("Id"); + + b.HasIndex("CityId"); + + b.ToTable("Districts"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.DoctorProfile", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("Bio") + .HasMaxLength(1000) + .HasColumnType("character varying(1000)"); + + b.Property("CityId") + .HasColumnType("integer"); + + b.Property("IsVerified") + .HasColumnType("boolean"); + + b.Property("LicenseNo") + .HasMaxLength(20) + .HasColumnType("character varying(20)"); + + b.Property("RoleId") + .HasColumnType("integer"); + + b.Property("Specialty") + .IsRequired() + .HasMaxLength(100) + .HasColumnType("character varying(100)"); + + b.Property("UserId") + .HasColumnType("integer"); + + b.Property("YearsExperience") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.HasIndex("CityId"); + + b.HasIndex("RoleId"); + + b.HasIndex("UserId") + .IsUnique(); + + b.ToTable("DoctorProfiles"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Facility", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("Address") + .HasMaxLength(500) + .HasColumnType("character varying(500)"); + + b.Property("BaleId") + .HasMaxLength(50) + .HasColumnType("character varying(50)"); + + b.Property("CityId") + .HasColumnType("integer"); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("DistrictId") + .HasColumnType("integer"); + + b.Property("IsVerified") + .HasColumnType("boolean"); + + b.Property("Lat") + .HasColumnType("double precision"); + + b.Property("Lng") + .HasColumnType("double precision"); + + b.Property("Name") + .IsRequired() + .HasMaxLength(200) + .HasColumnType("character varying(200)"); + + b.Property("OwnerUserId") + .HasColumnType("integer"); + + b.Property("Phone") + .HasMaxLength(20) + .HasColumnType("character varying(20)"); + + b.Property("Type") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.HasIndex("CityId"); + + b.HasIndex("DistrictId"); + + b.HasIndex("OwnerUserId"); + + b.ToTable("Facilities"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.InterestEvent", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("bigint"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("EventType") + .HasColumnType("integer"); + + b.Property("JobOpeningId") + .HasColumnType("integer"); + + b.Property("ShiftId") + .HasColumnType("integer"); + + b.Property("VisitorId") + .IsRequired() + .HasColumnType("character varying(36)"); + + b.HasKey("Id"); + + b.HasIndex("JobOpeningId"); + + b.HasIndex("ShiftId"); + + b.HasIndex("VisitorId", "CreatedAt"); + + b.ToTable("InterestEvents"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.JobOpening", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("Description") + .HasMaxLength(2000) + .HasColumnType("character varying(2000)"); + + b.Property("EmploymentType") + .HasColumnType("integer"); + + b.Property("FacilityId") + .HasColumnType("integer"); + + b.Property("Requirements") + .HasMaxLength(1000) + .HasColumnType("character varying(1000)"); + + b.Property("RoleId") + .HasColumnType("integer"); + + b.Property("SalaryMax") + .HasColumnType("bigint"); + + b.Property("SalaryMin") + .HasColumnType("bigint"); + + b.Property("Source") + .HasColumnType("integer"); + + b.Property("SourceUrl") + .HasMaxLength(500) + .HasColumnType("character varying(500)"); + + b.Property("Status") + .HasColumnType("integer"); + + b.Property("Title") + .IsRequired() + .HasMaxLength(200) + .HasColumnType("character varying(200)"); + + b.HasKey("Id"); + + b.HasIndex("FacilityId"); + + b.HasIndex("RoleId"); + + b.HasIndex("Status"); + + b.ToTable("JobOpenings"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.RawListing", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("Confidence") + .HasColumnType("integer"); + + b.Property("ContentHash") + .HasMaxLength(64) + .HasColumnType("character varying(64)"); + + b.Property("FetchedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("LinkedShiftId") + .HasColumnType("integer"); + + b.Property("ParsedJson") + .HasColumnType("text"); + + b.Property("RawText") + .IsRequired() + .HasColumnType("text"); + + b.Property("SourceChannel") + .IsRequired() + .HasMaxLength(200) + .HasColumnType("character varying(200)"); + + b.Property("SourceUrl") + .HasMaxLength(500) + .HasColumnType("character varying(500)"); + + b.Property("Status") + .HasColumnType("integer"); + + b.Property("ValidationNotes") + .HasMaxLength(1000) + .HasColumnType("character varying(1000)"); + + b.HasKey("Id"); + + b.HasIndex("ContentHash"); + + b.HasIndex("LinkedShiftId"); + + b.HasIndex("Status"); + + b.ToTable("RawListings"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Role", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("Category") + .IsRequired() + .HasMaxLength(50) + .HasColumnType("character varying(50)"); + + b.Property("IsActive") + .HasColumnType("boolean"); + + b.Property("Name") + .IsRequired() + .HasMaxLength(100) + .HasColumnType("character varying(100)"); + + b.Property("SortOrder") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.ToTable("Roles"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Shift", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("Date") + .HasColumnType("date"); + + b.Property("Description") + .HasMaxLength(1500) + .HasColumnType("character varying(1500)"); + + b.Property("EndTime") + .HasColumnType("time without time zone"); + + b.Property("FacilityId") + .HasColumnType("integer"); + + b.Property("PayAmount") + .HasColumnType("bigint"); + + b.Property("PayType") + .HasColumnType("integer"); + + b.Property("RoleId") + .HasColumnType("integer"); + + b.Property("SharePercent") + .HasColumnType("integer"); + + b.Property("ShiftType") + .HasColumnType("integer"); + + b.Property("Source") + .HasColumnType("integer"); + + b.Property("SourceUrl") + .HasMaxLength(500) + .HasColumnType("character varying(500)"); + + b.Property("SpecialtyRequired") + .IsRequired() + .HasMaxLength(100) + .HasColumnType("character varying(100)"); + + b.Property("StartTime") + .HasColumnType("time without time zone"); + + b.Property("Status") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.HasIndex("FacilityId"); + + b.HasIndex("RoleId"); + + b.HasIndex("Date", "Status"); + + b.ToTable("Shifts"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.User", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("FullName") + .HasMaxLength(150) + .HasColumnType("character varying(150)"); + + b.Property("IsPhoneVerified") + .HasColumnType("boolean"); + + b.Property("Phone") + .IsRequired() + .HasMaxLength(20) + .HasColumnType("character varying(20)"); + + b.Property("Role") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.HasIndex("Phone") + .IsUnique(); + + b.ToTable("Users"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.UserPreferences", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + + b.Property("CityId") + .HasColumnType("integer"); + + b.Property("MinPay") + .HasColumnType("bigint"); + + b.Property("PreferredShiftType") + .HasColumnType("integer"); + + b.Property("RoleId") + .HasColumnType("integer"); + + b.Property("UpdatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("VisitorId") + .IsRequired() + .HasColumnType("character varying(36)"); + + b.HasKey("Id"); + + b.HasIndex("CityId"); + + b.HasIndex("RoleId"); + + b.HasIndex("VisitorId") + .IsUnique(); + + b.ToTable("UserPreferences"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Visitor", b => + { + b.Property("Id") + .HasMaxLength(36) + .HasColumnType("character varying(36)"); + + b.Property("CreatedAt") + .HasColumnType("timestamp with time zone"); + + b.Property("LastSeenAt") + .HasColumnType("timestamp with time zone"); + + b.Property("UserId") + .HasColumnType("integer"); + + b.HasKey("Id"); + + b.HasIndex("UserId"); + + b.ToTable("Visitors"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Application", b => + { + b.HasOne("JobsMedical.Web.Models.User", "Doctor") + .WithMany("Applications") + .HasForeignKey("DoctorId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("JobsMedical.Web.Models.Shift", "Shift") + .WithMany("Applications") + .HasForeignKey("ShiftId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("Doctor"); + + b.Navigation("Shift"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.District", b => + { + b.HasOne("JobsMedical.Web.Models.City", "City") + .WithMany() + .HasForeignKey("CityId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("City"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.DoctorProfile", b => + { + b.HasOne("JobsMedical.Web.Models.City", "City") + .WithMany() + .HasForeignKey("CityId"); + + b.HasOne("JobsMedical.Web.Models.Role", "Role") + .WithMany() + .HasForeignKey("RoleId"); + + b.HasOne("JobsMedical.Web.Models.User", "User") + .WithOne("DoctorProfile") + .HasForeignKey("JobsMedical.Web.Models.DoctorProfile", "UserId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("City"); + + b.Navigation("Role"); + + b.Navigation("User"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Facility", b => + { + b.HasOne("JobsMedical.Web.Models.City", "City") + .WithMany("Facilities") + .HasForeignKey("CityId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("JobsMedical.Web.Models.District", "District") + .WithMany("Facilities") + .HasForeignKey("DistrictId") + .OnDelete(DeleteBehavior.SetNull); + + b.HasOne("JobsMedical.Web.Models.User", "OwnerUser") + .WithMany() + .HasForeignKey("OwnerUserId") + .OnDelete(DeleteBehavior.SetNull); + + b.Navigation("City"); + + b.Navigation("District"); + + b.Navigation("OwnerUser"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.InterestEvent", b => + { + b.HasOne("JobsMedical.Web.Models.JobOpening", "JobOpening") + .WithMany() + .HasForeignKey("JobOpeningId") + .OnDelete(DeleteBehavior.Cascade); + + b.HasOne("JobsMedical.Web.Models.Shift", "Shift") + .WithMany() + .HasForeignKey("ShiftId") + .OnDelete(DeleteBehavior.Cascade); + + b.HasOne("JobsMedical.Web.Models.Visitor", "Visitor") + .WithMany("Events") + .HasForeignKey("VisitorId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("JobOpening"); + + b.Navigation("Shift"); + + b.Navigation("Visitor"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.JobOpening", b => + { + b.HasOne("JobsMedical.Web.Models.Facility", "Facility") + .WithMany() + .HasForeignKey("FacilityId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("JobsMedical.Web.Models.Role", "Role") + .WithMany() + .HasForeignKey("RoleId") + .OnDelete(DeleteBehavior.Restrict) + .IsRequired(); + + b.Navigation("Facility"); + + b.Navigation("Role"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.RawListing", b => + { + b.HasOne("JobsMedical.Web.Models.Shift", "LinkedShift") + .WithMany() + .HasForeignKey("LinkedShiftId"); + + b.Navigation("LinkedShift"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Shift", b => + { + b.HasOne("JobsMedical.Web.Models.Facility", "Facility") + .WithMany("Shifts") + .HasForeignKey("FacilityId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("JobsMedical.Web.Models.Role", "Role") + .WithMany("Shifts") + .HasForeignKey("RoleId") + .OnDelete(DeleteBehavior.Restrict) + .IsRequired(); + + b.Navigation("Facility"); + + b.Navigation("Role"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.UserPreferences", b => + { + b.HasOne("JobsMedical.Web.Models.City", "City") + .WithMany() + .HasForeignKey("CityId"); + + b.HasOne("JobsMedical.Web.Models.Role", "Role") + .WithMany() + .HasForeignKey("RoleId"); + + b.HasOne("JobsMedical.Web.Models.Visitor", "Visitor") + .WithOne("Preferences") + .HasForeignKey("JobsMedical.Web.Models.UserPreferences", "VisitorId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("City"); + + b.Navigation("Role"); + + b.Navigation("Visitor"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Visitor", b => + { + b.HasOne("JobsMedical.Web.Models.User", "User") + .WithMany() + .HasForeignKey("UserId") + .OnDelete(DeleteBehavior.SetNull); + + b.Navigation("User"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.City", b => + { + b.Navigation("Facilities"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.District", b => + { + b.Navigation("Facilities"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Facility", b => + { + b.Navigation("Shifts"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Role", b => + { + b.Navigation("Shifts"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Shift", b => + { + b.Navigation("Applications"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.User", b => + { + b.Navigation("Applications"); + + b.Navigation("DoctorProfile"); + }); + + modelBuilder.Entity("JobsMedical.Web.Models.Visitor", b => + { + b.Navigation("Events"); + + b.Navigation("Preferences"); + }); +#pragma warning restore 612, 618 + } + } +} diff --git a/src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.cs b/src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.cs new file mode 100644 index 0000000..debc151 --- /dev/null +++ b/src/JobsMedical.Web/Migrations/20260603044159_IngestionFields.cs @@ -0,0 +1,69 @@ +using Microsoft.EntityFrameworkCore.Migrations; + +#nullable disable + +namespace JobsMedical.Web.Migrations +{ + /// + public partial class IngestionFields : Migration + { + /// + protected override void Up(MigrationBuilder migrationBuilder) + { + migrationBuilder.AddColumn( + name: "Confidence", + table: "RawListings", + type: "integer", + nullable: false, + defaultValue: 0); + + migrationBuilder.AddColumn( + name: "ContentHash", + table: "RawListings", + type: "character varying(64)", + maxLength: 64, + nullable: true); + + migrationBuilder.AddColumn( + name: "ValidationNotes", + table: "RawListings", + type: "character varying(1000)", + maxLength: 1000, + nullable: true); + + migrationBuilder.CreateIndex( + name: "IX_RawListings_ContentHash", + table: "RawListings", + column: "ContentHash"); + + migrationBuilder.CreateIndex( + name: "IX_RawListings_Status", + table: "RawListings", + column: "Status"); + } + + /// + protected override void Down(MigrationBuilder migrationBuilder) + { + migrationBuilder.DropIndex( + name: "IX_RawListings_ContentHash", + table: "RawListings"); + + migrationBuilder.DropIndex( + name: "IX_RawListings_Status", + table: "RawListings"); + + migrationBuilder.DropColumn( + name: "Confidence", + table: "RawListings"); + + migrationBuilder.DropColumn( + name: "ContentHash", + table: "RawListings"); + + migrationBuilder.DropColumn( + name: "ValidationNotes", + table: "RawListings"); + } + } +} diff --git a/src/JobsMedical.Web/Migrations/AppDbContextModelSnapshot.cs b/src/JobsMedical.Web/Migrations/AppDbContextModelSnapshot.cs index e53a23f..c2e27ac 100644 --- a/src/JobsMedical.Web/Migrations/AppDbContextModelSnapshot.cs +++ b/src/JobsMedical.Web/Migrations/AppDbContextModelSnapshot.cs @@ -319,6 +319,13 @@ namespace JobsMedical.Web.Migrations NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property("Id")); + b.Property("Confidence") + .HasColumnType("integer"); + + b.Property("ContentHash") + .HasMaxLength(64) + .HasColumnType("character varying(64)"); + b.Property("FetchedAt") .HasColumnType("timestamp with time zone"); @@ -344,10 +351,18 @@ namespace JobsMedical.Web.Migrations b.Property("Status") .HasColumnType("integer"); + b.Property("ValidationNotes") + .HasMaxLength(1000) + .HasColumnType("character varying(1000)"); + b.HasKey("Id"); + b.HasIndex("ContentHash"); + b.HasIndex("LinkedShiftId"); + b.HasIndex("Status"); + b.ToTable("RawListings"); }); diff --git a/src/JobsMedical.Web/Models/Enums.cs b/src/JobsMedical.Web/Models/Enums.cs index 08a8fdd..6bc7fb0 100644 --- a/src/JobsMedical.Web/Models/Enums.cs +++ b/src/JobsMedical.Web/Models/Enums.cs @@ -55,9 +55,10 @@ public enum ApplicationStatus public enum RawListingStatus { - New = 0, // جدید - Normalized = 1, // تبدیل شده به شیفت - Discarded = 2 // کنار گذاشته شده + New = 0, // جدید (آماده بررسی) + Normalized = 1, // تبدیل شده به شیفت/استخدام + Discarded = 2, // کنار گذاشته شده (یا اسپم) + Flagged = 3 // ناقص/مشکوک — نیازمند بررسی دستی بیشتر } public enum EmploymentType diff --git a/src/JobsMedical.Web/Models/RawListing.cs b/src/JobsMedical.Web/Models/RawListing.cs index 37d8b2a..f6fbfce 100644 --- a/src/JobsMedical.Web/Models/RawListing.cs +++ b/src/JobsMedical.Web/Models/RawListing.cs @@ -27,5 +27,16 @@ public class RawListing [MaxLength(500)] public string? SourceUrl { get; set; } + /// SHA-256 of the normalized text — used to dedupe across ingestion runs. + [MaxLength(64)] + public string? ContentHash { get; set; } + + /// Parser+validator confidence 0–100 (how complete/usable the listing looks). + public int Confidence { get; set; } + + /// Human-readable validation findings (missing fields, spam flags, etc.). + [MaxLength(1000)] + public string? ValidationNotes { get; set; } + public DateTime FetchedAt { get; set; } = DateTime.UtcNow; } diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml b/src/JobsMedical.Web/Pages/Admin/Index.cshtml index 4671dac..bbbee0a 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml @@ -6,29 +6,55 @@
-

پنل مدیریت — صف آگهی‌های خام

+

پنل مدیریت — جمع‌آوری و صف آگهی‌ها

- آگهی‌های جمع‌آوری‌شده از کانال‌ها را اینجا بررسی، ساختارمند و منتشر کن. - (@JalaliDate.ToPersianDigits(Model.Queue.Count.ToString()) در انتظار بررسی) + آگهی‌های جمع‌آوری‌شده از منابع را بررسی، ساختارمند و منتشر کن. + (@JalaliDate.ToPersianDigits(Model.Queue.Count.ToString()) در صف، + @JalaliDate.ToPersianDigits(Model.Flagged.Count.ToString()) پرچم‌خورده) · تأیید مراکز درمانی

+ @if (Model.IngestMessage is not null) + { +
✓ @Model.IngestMessage
+ } +
+

صف بررسی

@if (Model.Queue.Count == 0) { -
صف خالی است. آگهی جدیدی برای بررسی وجود ندارد.
+
صف خالی است. «اجرای جمع‌آوری» را بزن یا آگهی اضافه کن.
} else { foreach (var r in Model.Queue) { -
-
- @r.SourceChannel - @JalaliDate.ToLongDate(DateOnly.FromDateTime(r.FetchedAt)) -
-

@r.RawText

- بررسی و انتشار ← -
+ + } + } + + @if (Model.Flagged.Count > 0) + { +

پرچم‌خورده (ناقص/مشکوک)

+

اعتبارسنجی این‌ها را کامل ندانست؛ در صورت صحت می‌توانی منتشرشان کنی.

+ foreach (var r in Model.Flagged) + { + } }
diff --git a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs index 7ecdb9a..a790404 100644 --- a/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs +++ b/src/JobsMedical.Web/Pages/Admin/Index.cshtml.cs @@ -1,5 +1,6 @@ using JobsMedical.Web.Data; using JobsMedical.Web.Models; +using JobsMedical.Web.Services.Scraping; using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Mvc; using Microsoft.AspNetCore.Mvc.RazorPages; @@ -7,19 +8,29 @@ using Microsoft.EntityFrameworkCore; namespace JobsMedical.Web.Pages.Admin; -[Authorize(Roles = "Admin")] // secured by the OTP-auth Admin role +[Authorize(Roles = "Admin")] public class IndexModel : PageModel { private readonly AppDbContext _db; - public IndexModel(AppDbContext db) => _db = db; + private readonly IngestionService _ingest; + + public IndexModel(AppDbContext db, IngestionService ingest) + { + _db = db; + _ingest = ingest; + } public List Queue { get; private set; } = new(); + public List Flagged { get; private set; } = new(); + public IReadOnlyList<(string Name, bool Enabled)> Sources { get; private set; } = new List<(string, bool)>(); public int PublishedShifts { get; private set; } public int PublishedJobs { get; private set; } [BindProperty] public string? SourceChannel { get; set; } [BindProperty] public string? RawText { get; set; } + [TempData] public string? IngestMessage { get; set; } + public async Task OnGetAsync() => await LoadAsync(); public async Task OnPostAddAsync() @@ -37,11 +48,23 @@ public class IndexModel : PageModel return RedirectToPage(); } + public async Task OnPostRunIngestionAsync() + { + var s = await _ingest.RunAsync(); + IngestMessage = $"جمع‌آوری انجام شد — {s.TotalQueued} در صف، {s.TotalFlagged} پرچم‌خورده، " + + $"{s.TotalSpam} اسپم، {s.TotalDuplicates} تکراری."; + return RedirectToPage(); + } + private async Task LoadAsync() { Queue = await _db.RawListings .Where(r => r.Status == RawListingStatus.New) + .OrderByDescending(r => r.Confidence).ThenByDescending(r => r.FetchedAt).ToListAsync(); + Flagged = await _db.RawListings + .Where(r => r.Status == RawListingStatus.Flagged) .OrderByDescending(r => r.FetchedAt).ToListAsync(); + Sources = _ingest.Sources; PublishedShifts = await _db.Shifts.CountAsync(s => s.Source != ShiftSource.Direct); PublishedJobs = await _db.JobOpenings.CountAsync(); } diff --git a/src/JobsMedical.Web/Pages/Shared/_HourBar.cshtml b/src/JobsMedical.Web/Pages/Shared/_HourBar.cshtml new file mode 100644 index 0000000..ba41a65 --- /dev/null +++ b/src/JobsMedical.Web/Pages/Shared/_HourBar.cshtml @@ -0,0 +1,42 @@ +@model JobsMedical.Web.Models.Shift +@using System.Globalization +@{ + var s = Model; + var ci = CultureInfo.InvariantCulture; + int sm = s.StartTime.Hour * 60 + s.StartTime.Minute; + int em = s.EndTime.Hour * 60 + s.EndTime.Minute; + var typeClass = s.ShiftType switch + { + ShiftType.Day => "day", + ShiftType.Evening => "evening", + ShiftType.Night => "night", + _ => "oncall", + }; + + // Build one or two segments (overnight shifts wrap past midnight). On-call = whole day. + var segs = new List<(double left, double width)>(); + if (s.ShiftType == ShiftType.OnCall || em == sm) + segs.Add((0, 100)); + else if (em > sm) + segs.Add((sm / 1440.0 * 100, (em - sm) / 1440.0 * 100)); + else + { + segs.Add((sm / 1440.0 * 100, (1440 - sm) / 1440.0 * 100)); + segs.Add((0, em / 1440.0 * 100)); + } + string Pct(double v) => v.ToString("0.##", ci); +} +
+
+ + + + @foreach (var seg in segs) + { + + } +
+
+ ۰۶۱۲۱۸۲۴ +
+
diff --git a/src/JobsMedical.Web/Pages/Shared/_RawListingRow.cshtml b/src/JobsMedical.Web/Pages/Shared/_RawListingRow.cshtml new file mode 100644 index 0000000..307cf19 --- /dev/null +++ b/src/JobsMedical.Web/Pages/Shared/_RawListingRow.cshtml @@ -0,0 +1,20 @@ +@model JobsMedical.Web.Models.RawListing +@{ + var c = Model.Confidence; + var confClass = c >= 70 ? "badge-verified" : c >= 50 ? "badge-day" : "badge-type"; +} +
+
+ @Model.SourceChannel + + اطمینان @JalaliDate.ToPersianDigits(c.ToString())٪ + @JalaliDate.ToLongDate(DateOnly.FromDateTime(Model.FetchedAt)) + +
+

@Model.RawText

+ @if (!string.IsNullOrEmpty(Model.ValidationNotes)) + { +

⚠ @Model.ValidationNotes

+ } + بررسی و انتشار ← +
diff --git a/src/JobsMedical.Web/Pages/Shared/_RecommendationCard.cshtml b/src/JobsMedical.Web/Pages/Shared/_RecommendationCard.cshtml index 763905c..fbe1448 100644 --- a/src/JobsMedical.Web/Pages/Shared/_RecommendationCard.cshtml +++ b/src/JobsMedical.Web/Pages/Shared/_RecommendationCard.cshtml @@ -22,6 +22,7 @@ 📍 @s.Facility?.City?.Name
📅 @JalaliDate.WeekDayName(s.Date)، @JalaliDate.ToLongDate(s.Date) — 🕐 @JalaliDate.Time(s.StartTime)
+ @* The "why" — what makes a pattern engine trustworthy: every pick is explained. *@
diff --git a/src/JobsMedical.Web/Pages/Shared/_ShiftCard.cshtml b/src/JobsMedical.Web/Pages/Shared/_ShiftCard.cshtml index 4d07d22..0bcade4 100644 --- a/src/JobsMedical.Web/Pages/Shared/_ShiftCard.cshtml +++ b/src/JobsMedical.Web/Pages/Shared/_ShiftCard.cshtml @@ -30,6 +30,7 @@ }
📅 @JalaliDate.WeekDayName(Model.Date)، @JalaliDate.ToLongDate(Model.Date)
🕐 @JalaliDate.Time(Model.StartTime) تا @JalaliDate.Time(Model.EndTime)
+
@JalaliDate.PayLabel(Model.PayType, Model.PayAmount, Model.SharePercent) جزئیات diff --git a/src/JobsMedical.Web/Pages/Shifts/Details.cshtml b/src/JobsMedical.Web/Pages/Shifts/Details.cshtml index 64c7a97..3897996 100644 --- a/src/JobsMedical.Web/Pages/Shifts/Details.cshtml +++ b/src/JobsMedical.Web/Pages/Shifts/Details.cshtml @@ -50,6 +50,10 @@
مدت@JalaliDate.ToPersianDigits(s.DurationHours.ToString("0.#")) ساعت
نقش مورد نیاز@(s.Role?.Name ?? s.SpecialtyRequired)
پرداخت@JalaliDate.PayLabel(s.PayType, s.PayAmount, s.SharePercent)
+
+ بازه ساعت کاری در شبانه‌روز + +
@if (!string.IsNullOrEmpty(s.Description)) diff --git a/src/JobsMedical.Web/Program.cs b/src/JobsMedical.Web/Program.cs index 8115607..0c8c847 100644 --- a/src/JobsMedical.Web/Program.cs +++ b/src/JobsMedical.Web/Program.cs @@ -21,6 +21,23 @@ builder.Services.AddScoped(); // Listing parser: heuristic now; swap for an LLM-backed IListingParser later. builder.Services.AddSingleton(); +// Scrape/ingestion engine: pluggable sources → dedupe → parse → validate → review queue. +builder.Services.Configure( + builder.Configuration.GetSection("Ingestion")); +builder.Services.Configure( + builder.Configuration.GetSection("Ingestion:Telegram")); +builder.Services.Configure( + builder.Configuration.GetSection("Ingestion:Divar")); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddScoped(); +builder.Services.AddHostedService(); + // Phone-OTP cookie auth. builder.Services.AddAuthentication(CookieAuthenticationDefaults.AuthenticationScheme) .AddCookie(o => diff --git a/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs new file mode 100644 index 0000000..ce537bf --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/DivarListingSource.cs @@ -0,0 +1,42 @@ +using Microsoft.Extensions.Options; + +namespace JobsMedical.Web.Services.Scraping; + +public class DivarOptions +{ + public bool Enabled { get; set; } + public string? City { get; set; } // e.g. "tehran" + public List Queries { get; set; } = new(); // search terms, e.g. "استخدام پزشک" +} + +/// +/// Divar source. Credential-ready: configure city + queries in (Ingestion:Divar) and implement +/// the fetch against Divar's listing API/HTML. Dormant until enabled. +/// +public class DivarListingSource : IListingSource +{ + private readonly DivarOptions _opts; + private readonly ILogger _log; + + public DivarListingSource(IOptions opts, ILogger log) + { + _opts = opts.Value; + _log = log; + } + + public string Name => "دیوار"; + public bool Enabled => _opts.Enabled && _opts.Queries.Count > 0; + + public Task> FetchAsync(CancellationToken ct = default) + { + if (!Enabled) + { + _log.LogInformation("Divar source not configured — skipping."); + return Task.FromResult>(Array.Empty()); + } + // TODO(prod): query Divar for each term in the configured city, map each ad's + // title+description to new ScrapedItem(Name, text, adUrl). + _log.LogWarning("Divar fetch not yet implemented; returning empty."); + return Task.FromResult>(Array.Empty()); + } +} diff --git a/src/JobsMedical.Web/Services/Scraping/IListingSource.cs b/src/JobsMedical.Web/Services/Scraping/IListingSource.cs new file mode 100644 index 0000000..8079fa2 --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/IListingSource.cs @@ -0,0 +1,15 @@ +namespace JobsMedical.Web.Services.Scraping; + +/// One raw post pulled from a source (a Telegram message, a Divar ad, etc.). +public record ScrapedItem(string Source, string RawText, string? SourceUrl = null); + +/// +/// A pluggable source the ingestion engine pulls from. Implement once per channel/site. +/// `Enabled` lets a source be present but dormant until it's configured with credentials. +/// +public interface IListingSource +{ + string Name { get; } + bool Enabled { get; } + Task> FetchAsync(CancellationToken ct = default); +} diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionService.cs b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs new file mode 100644 index 0000000..65356d5 --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/IngestionService.cs @@ -0,0 +1,107 @@ +using System.Security.Cryptography; +using System.Text; +using System.Text.RegularExpressions; +using JobsMedical.Web.Data; +using JobsMedical.Web.Models; +using Microsoft.EntityFrameworkCore; + +namespace JobsMedical.Web.Services.Scraping; + +public record SourceResult(string Source, int Fetched, int Queued, int Flagged, int Spam, int Duplicates); + +public record IngestionSummary(List Sources) +{ + public int TotalQueued => Sources.Sum(s => s.Queued); + public int TotalFlagged => Sources.Sum(s => s.Flagged); + public int TotalSpam => Sources.Sum(s => s.Spam); + public int TotalDuplicates => Sources.Sum(s => s.Duplicates); +} + +/// +/// The scrape engine. Pulls from every enabled , dedupes by content +/// hash, parses with , validates with , +/// and stores each as a with a status: New (queued for review), +/// Flagged (incomplete/suspicious), or Discarded (spam). Source-agnostic — add a source and it +/// flows through unchanged. +/// +public class IngestionService +{ + private readonly AppDbContext _db; + private readonly IEnumerable _sources; + private readonly IListingParser _parser; + private readonly ListingValidator _validator; + private readonly ILogger _log; + + public IngestionService(AppDbContext db, IEnumerable sources, + IListingParser parser, ListingValidator validator, ILogger log) + { + _db = db; + _sources = sources; + _parser = parser; + _validator = validator; + _log = log; + } + + public IReadOnlyList<(string Name, bool Enabled)> Sources => + _sources.Select(s => (s.Name, s.Enabled)).ToList(); + + public async Task RunAsync(CancellationToken ct = default) + { + var roles = await _db.Roles.Select(r => r.Name).ToListAsync(ct); + var cities = await _db.Cities.Select(c => c.Name).ToListAsync(ct); + var districts = await _db.Districts.Select(d => d.Name).ToListAsync(ct); + + var results = new List(); + + foreach (var source in _sources.Where(s => s.Enabled)) + { + int fetched = 0, queued = 0, flagged = 0, spam = 0, dupes = 0; + IReadOnlyList items; + try { items = await source.FetchAsync(ct); } + catch (Exception ex) { _log.LogError(ex, "Source {Source} fetch failed", source.Name); continue; } + + foreach (var item in items) + { + fetched++; + var hash = Hash(item.RawText); + if (await _db.RawListings.AnyAsync(r => r.ContentHash == hash, ct)) { dupes++; continue; } + + var parsed = _parser.Parse(item.RawText, roles, cities, districts); + var val = _validator.Validate(item.RawText, parsed); + + var status = val.IsSpam ? RawListingStatus.Discarded + : val.IsValid ? RawListingStatus.New + : RawListingStatus.Flagged; + if (status == RawListingStatus.New) queued++; + else if (status == RawListingStatus.Flagged) flagged++; + else spam++; + + _db.RawListings.Add(new RawListing + { + SourceChannel = item.Source, + SourceUrl = item.SourceUrl, + RawText = item.RawText.Trim(), + ContentHash = hash, + Confidence = val.Confidence, + ValidationNotes = val.Issues.Count > 0 ? string.Join("؛ ", val.Issues) : null, + Status = status, + }); + } + + await _db.SaveChangesAsync(ct); + results.Add(new SourceResult(source.Name, fetched, queued, flagged, spam, dupes)); + _log.LogInformation("Ingestion {Source}: fetched={F} queued={Q} flagged={Fl} spam={S} dupes={D}", + source.Name, fetched, queued, flagged, spam, dupes); + } + + return new IngestionSummary(results); + } + + /// SHA-256 hex of the whitespace-normalized text (for cross-run dedupe). + private static string Hash(string text) + { + var normalized = Regex.Replace((text ?? "").Trim(), @"\s+", " "); + var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalized)); + return Convert.ToHexString(bytes).ToLowerInvariant(); + } +} diff --git a/src/JobsMedical.Web/Services/Scraping/IngestionWorker.cs b/src/JobsMedical.Web/Services/Scraping/IngestionWorker.cs new file mode 100644 index 0000000..3ed6173 --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/IngestionWorker.cs @@ -0,0 +1,59 @@ +using Microsoft.Extensions.Options; + +namespace JobsMedical.Web.Services.Scraping; + +public class IngestionOptions +{ + public bool Enabled { get; set; } = false; // off by default — opt in via config + public int IntervalMinutes { get; set; } = 30; +} + +/// +/// Periodically runs the ingestion engine when enabled (Ingestion:Enabled=true). Off by default +/// so nothing scrapes uninvited; admins can also trigger a run on demand from the admin UI. +/// +public class IngestionWorker : BackgroundService +{ + private readonly IServiceScopeFactory _scopes; + private readonly IngestionOptions _opts; + private readonly ILogger _log; + + public IngestionWorker(IServiceScopeFactory scopes, IOptions opts, + ILogger log) + { + _scopes = scopes; + _opts = opts.Value; + _log = log; + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + if (!_opts.Enabled) + { + _log.LogInformation("Ingestion worker disabled (Ingestion:Enabled=false)."); + return; + } + + var interval = TimeSpan.FromMinutes(Math.Max(1, _opts.IntervalMinutes)); + _log.LogInformation("Ingestion worker on; every {Min} min.", _opts.IntervalMinutes); + + while (!stoppingToken.IsCancellationRequested) + { + try + { + using var scope = _scopes.CreateScope(); + var svc = scope.ServiceProvider.GetRequiredService(); + var summary = await svc.RunAsync(stoppingToken); + _log.LogInformation("Scheduled ingestion: queued={Q} flagged={F} spam={S} dupes={D}", + summary.TotalQueued, summary.TotalFlagged, summary.TotalSpam, summary.TotalDuplicates); + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + _log.LogError(ex, "Scheduled ingestion run failed"); + } + + try { await Task.Delay(interval, stoppingToken); } + catch (OperationCanceledException) { break; } + } + } +} diff --git a/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs b/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs new file mode 100644 index 0000000..4876203 --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/ListingValidator.cs @@ -0,0 +1,63 @@ +using System.Text.RegularExpressions; +using JobsMedical.Web.Models; + +namespace JobsMedical.Web.Services.Scraping; + +public record ValidationResult(bool IsValid, bool IsSpam, int Confidence, List Issues); + +/// +/// Scores a parsed listing for completeness and screens out spam. A listing must look like a +/// real medical shift/job (role + a location or pay signal, plausible length, contact) to pass. +/// The confidence drives whether it lands in the review queue (New), gets Flagged for a closer +/// look, or is auto-discarded as spam. +/// +public class ListingValidator +{ + // Posts that smell like ads/scams rather than medical shifts. + private static readonly string[] SpamMarkers = + { + "سرمایه گذاری", "سرمایه‌گذاری", "وام", "ارز دیجیتال", "رمز ارز", "فروش فالوور", + "بک لینک", "تبلیغات", "قرعه کشی", "جایزه", "کازینو", "شرط بندی", "بیت کوین" + }; + + private static readonly string[] MedicalMarkers = + { + "شیفت", "درمانگاه", "بیمارستان", "کلینیک", "پزشک", "پرستار", "ماما", "تکنسین", + "اورژانس", "استخدام", "کادر درمان", "مطب", "آنکال", "کشیک" + }; + + public ValidationResult Validate(string rawText, ParsedListing parsed) + { + var issues = new List(); + var text = rawText ?? ""; + + bool isSpam = SpamMarkers.Any(text.Contains) + && !MedicalMarkers.Any(text.Contains); + if (isSpam) issues.Add("به‌نظر اسپم/تبلیغاتی است"); + + bool looksMedical = MedicalMarkers.Any(text.Contains); + if (!looksMedical) issues.Add("نشانه‌ای از حوزه درمان یافت نشد"); + + int score = 0; + if (parsed.RoleName is not null) score += 30; else issues.Add("نقش مشخص نیست"); + if (parsed.CityName is not null || parsed.DistrictName is not null) score += 20; + else issues.Add("شهر/محل مشخص نیست"); + if (parsed.PayAmount is not null || parsed.SharePercent is not null || parsed.PayNegotiable) + score += 20; else issues.Add("اطلاعات پرداخت یافت نشد"); + if (parsed.Phone is not null) score += 15; else issues.Add("شماره تماس یافت نشد"); + if (parsed.Kind == ListingKind.Shift && parsed.ShiftType is not null) score += 10; + if (looksMedical) score += 5; + + // Sanity on length — a few words isn't a real listing; a wall of text is suspicious. + var len = text.Trim().Length; + if (len < 25) { score -= 20; issues.Add("متن خیلی کوتاه است"); } + if (len > 1500) { score -= 10; issues.Add("متن غیرعادی بلند است"); } + if (Regex.Matches(text, @"https?://").Count >= 3) { score -= 15; issues.Add("لینک‌های متعدد"); } + + score = Math.Clamp(score, 0, 100); + + // Valid enough for the queue if it's medical, not spam, and reasonably complete. + bool isValid = !isSpam && looksMedical && score >= 50; + return new ValidationResult(isValid, isSpam, score, issues); + } +} diff --git a/src/JobsMedical.Web/Services/Scraping/SampleListingSource.cs b/src/JobsMedical.Web/Services/Scraping/SampleListingSource.cs new file mode 100644 index 0000000..7484e6f --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/SampleListingSource.cs @@ -0,0 +1,27 @@ +namespace JobsMedical.Web.Services.Scraping; + +/// +/// A built-in source of representative Persian posts (the kind found in shift channels). Always +/// available, needs no credentials — it lets the whole ingestion → validation → review pipeline +/// run and be demoed today, and doubles as a fixture mix of good, incomplete, and spam posts. +/// +public class SampleListingSource : IListingSource +{ + public string Name => "نمونه (کانال آزمایشی)"; + public bool Enabled => true; + + private static readonly string[] Posts = + { + "درمانگاه شبانه‌روزی در سعادت‌آباد نیازمند پزشک عمومی برای شیفت شب، کارانه ۳ میلیون تومان. تماس ۰۹۱۲۳۴۵۶۷۸۹", + "کلینیک تخصصی در تهران به پرستار برای شیفت عصر نیازمند است، ۵۰٪ سهم درآمد. ۰۹۳۵۱۱۱۲۲۳۳", + "استخدام ماما تمام‌وقت در بیمارستان خصوصی، حقوق توافقی. منطقه شهرک غرب.", + "نیازمند تکنسین اتاق عمل جهت همکاری در نارمک، شیفت صبح. ۰۹۱۲۰۰۰۰۰۰۰", + "فروش فالوور و بک لینک ارزان، سرمایه گذاری در ارز دیجیتال با سود تضمینی!", // spam + "پزشک", // too short / incomplete + "بیمارستان آتیه جهت تکمیل کادر درمان به پزشک عمومی مقیم نیازمند است. قرارداد یک‌ساله، حقوق ۴۵ میلیون ماهانه. تهرانپارس.", + }; + + public Task> FetchAsync(CancellationToken ct = default) + => Task.FromResult>( + Posts.Select(p => new ScrapedItem(Name, p)).ToList()); +} diff --git a/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs b/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs new file mode 100644 index 0000000..5977caa --- /dev/null +++ b/src/JobsMedical.Web/Services/Scraping/TelegramListingSource.cs @@ -0,0 +1,44 @@ +using Microsoft.Extensions.Options; + +namespace JobsMedical.Web.Services.Scraping; + +public class TelegramOptions +{ + public bool Enabled { get; set; } + public string? BotToken { get; set; } + public List Channels { get; set; } = new(); // @channel handles to read +} + +/// +/// Telegram/Bale channel source. Credential-ready: wire a bot token + channel list in config +/// (Ingestion:Telegram) and implement the fetch against the Bot API (getUpdates / channel posts) +/// or a userbot. Dormant until enabled, so the engine runs without it. +/// +public class TelegramListingSource : IListingSource +{ + private readonly TelegramOptions _opts; + private readonly ILogger _log; + + public TelegramListingSource(IOptions opts, ILogger log) + { + _opts = opts.Value; + _log = log; + } + + public string Name => "تلگرام/بله"; + public bool Enabled => _opts.Enabled && !string.IsNullOrWhiteSpace(_opts.BotToken) && _opts.Channels.Count > 0; + + public Task> FetchAsync(CancellationToken ct = default) + { + if (!Enabled) + { + _log.LogInformation("Telegram source not configured — skipping."); + return Task.FromResult>(Array.Empty()); + } + // TODO(prod): call https://api.telegram.org/bot{token}/getUpdates (or channel history), + // map each message to new ScrapedItem(Name, message.Text, messageLink). The validation + + // dedupe pipeline downstream is already source-agnostic. + _log.LogWarning("Telegram fetch not yet implemented; returning empty."); + return Task.FromResult>(Array.Empty()); + } +} diff --git a/src/JobsMedical.Web/appsettings.json b/src/JobsMedical.Web/appsettings.json index 7956808..8fe568b 100644 --- a/src/JobsMedical.Web/appsettings.json +++ b/src/JobsMedical.Web/appsettings.json @@ -11,5 +11,11 @@ }, "Auth": { "AdminPhone": "09120000000" + }, + "Ingestion": { + "Enabled": false, + "IntervalMinutes": 30, + "Telegram": { "Enabled": false, "BotToken": "", "Channels": [] }, + "Divar": { "Enabled": false, "City": "tehran", "Queries": [] } } } diff --git a/src/JobsMedical.Web/wwwroot/css/site.css b/src/JobsMedical.Web/wwwroot/css/site.css index cffc059..bdd2833 100644 --- a/src/JobsMedical.Web/wwwroot/css/site.css +++ b/src/JobsMedical.Web/wwwroot/css/site.css @@ -188,6 +188,20 @@ label { font-size: 13px; } .alert { padding: 12px 16px; border-radius: 10px; margin-bottom: 16px; font-weight: 600; } .alert-success { background: var(--primary-soft); color: var(--primary-dark); } +/* hour-range timeline bar */ +.hourbar-wrap { direction: ltr; margin: 6px 0 2px; } +.hourbar { + position: relative; height: 9px; background: #eef3f6; + border-radius: 999px; overflow: hidden; +} +.hourbar-grid { position: absolute; top: 0; bottom: 0; width: 1px; background: rgba(0,0,0,.06); } +.hourbar-fill { position: absolute; top: 0; bottom: 0; border-radius: 999px; } +.hourbar-fill.day { background: #f0a052; } +.hourbar-fill.evening { background: #e07b3a; } +.hourbar-fill.night { background: #5566c4; } +.hourbar-fill.oncall { background: linear-gradient(90deg, #8a5cc0 25%, #b79be0 50%, #8a5cc0 75%); } +.hourbar-axis { display: flex; justify-content: space-between; font-size: 10px; color: var(--muted); margin-top: 3px; } + /* recommendation reason chips */ .rec-reasons { display: flex; flex-direction: column; gap: 4px; margin: 2px 0; } .rec-reason { font-size: 12px; color: var(--primary-dark); font-weight: 600; }