// 2026-06-01 07:46:56 +03:30
using System;
using System.Text;
using SoroushAsadi.Services;
namespace SoroushAsadi.Pages.Blog ;
/// <summary>
/// Page model for a single blog post. The post is looked up by <see cref="Slug"/> in a
/// built-in table of default articles; the body may be replaced by an override obtained
/// from the injected <c>ContentService</c>, and is rendered to HTML by a minimal
/// Markdown converter.
/// </summary>
public class PostModel(ContentService content) : BasePageModel
{
    /// <summary>Slug of the requested post; bound from the request (GET requests included).</summary>
    [Microsoft.AspNetCore.Mvc.BindProperty(SupportsGet = true)]
    public string Slug { get; set; } = "";

    /// <summary>Post title, in Persian or English depending on <c>IsFa</c>.</summary>
    public string Title { get; private set; } = "";

    /// <summary>Category label of the post.</summary>
    public string Category { get; private set; } = "";

    /// <summary>Read time shown for the post (presumably minutes — confirm against the view).</summary>
    public int ReadTime { get; private set; }

    /// <summary>Post body rendered to HTML by <see cref="SimpleMarkdown"/>; text content is HTML-escaped.</summary>
    public string BodyHtml { get; private set; } = "";

    /// <summary>True when <see cref="Slug"/> does not match any known post.</summary>
    public bool PostNotFound { get; private set; }

    // Default posts keyed by slug (Markdown-lite bodies, rendered server-side).
    private static readonly Dictionary<string, (string Cat, string TitleEn, string TitleFa, int RT, string Body)> _defaults = new()
    {
        ["rag-eval-framework"] = ("LLM", "A RAG evaluation framework that holds up in production", "چارچوب ارزیابی RAG که در عمل جواب میدهد", 8, DefaultBodies.RagEval),
        ["agentic-n8n-patterns"] = ("Automation", "Agentic patterns with n8n for the enterprise", "الگوهای عاملمحور با n8n برای سازمان", 11, DefaultBodies.N8nPatterns),
        ["vertex-cost-control"] = ("Google Stack", "Vertex AI cost control at scale", "کنترل هزینه روی Vertex AI در مقیاس بالا", 6, DefaultBodies.VertexCost),
        ["k8s-llm-inference"] = ("Infra", "Sub-50ms LLM inference on Kubernetes", "اجرای LLM روی Kubernetes با تأخیر زیر ۵۰ میلیثانیه", 14, DefaultBodies.K8sInference),
        ["flutter-on-device-ai"] = ("Mobile", "On-device AI in Flutter", "هوش مصنوعی روی دستگاه در Flutter", 9, DefaultBodies.FlutterAI),
        ["enterprise-ai-roadmap"] = ("Strategy", "A 90-day enterprise AI roadmap", "نقشهی راه هوش مصنوعی سازمانی در ۹۰ روز", 7, DefaultBodies.EnterpriseRoadmap),
    };

    /// <summary>
    /// Resolves the post for <see cref="Slug"/> and fills the display properties.
    /// Sets <see cref="PostNotFound"/> and returns early when the slug is unknown.
    /// </summary>
    public void OnGet()
    {
        // Slug is declared non-nullable, but binding may still leave it null at runtime
        // (TODO confirm); Dictionary.TryGetValue throws on a null key, so guard first.
        if (string.IsNullOrEmpty(Slug) || !_defaults.TryGetValue(Slug, out var def))
        {
            PostNotFound = true;
            return;
        }

        // Check for a DB override (stored under the "posts" key as slug→{body,...}).
        var overrides = content.GetPostOverrides();
        string body = def.Body;
        if (overrides.TryGetValue(Slug, out var node) && node["body"]?.GetValue<string>() is { } dbBody)
        {
            body = dbBody;
        }

        Title = IsFa ? def.TitleFa : def.TitleEn;
        Category = def.Cat;
        ReadTime = def.RT;
        BodyHtml = SimpleMarkdown(body);
    }

    /// <summary>
    /// Minimal Markdown → HTML converter working line by line: <c>## </c> → h2,
    /// <c>### </c> → h3, <c>- </c> → list item, blank line → newline, anything else → paragraph.
    /// Consecutive list items are wrapped in a single <c>&lt;ul&gt;</c>.
    /// </summary>
    /// <param name="md">Markdown source; null, empty or whitespace-only yields an empty string.</param>
    /// <returns>The generated HTML fragment.</returns>
    private static string SimpleMarkdown(string md)
    {
        if (string.IsNullOrWhiteSpace(md))
        {
            return "";
        }

        var html = new StringBuilder();
        bool inList = false;

        foreach (var rawLine in md.Split('\n'))
        {
            // TrimEnd also removes the '\r' left behind by CRLF line endings.
            var line = rawLine.TrimEnd();

            // Ordinal comparisons: culture-sensitive StartsWith ignores zero-width
            // characters, which would make the fixed-length slices below cut the wrong text.
            bool isItem = line.StartsWith("- ", StringComparison.Ordinal);

            // Open or close the surrounding <ul> whenever we enter or leave a run of items.
            if (isItem != inList)
            {
                html.Append(isItem ? "<ul>\n" : "</ul>\n");
                inList = isItem;
            }

            if (isItem)
            {
                html.Append("<li>").Append(Inline(line[2..])).Append("</li>\n");
            }
            else if (line.StartsWith("## ", StringComparison.Ordinal))
            {
                html.Append("<h2>").Append(Inline(line[3..])).Append("</h2>\n");
            }
            else if (line.StartsWith("### ", StringComparison.Ordinal))
            {
                html.Append("<h3>").Append(Inline(line[4..])).Append("</h3>\n");
            }
            else if (string.IsNullOrWhiteSpace(line))
            {
                html.Append('\n');
            }
            else
            {
                html.Append("<p>").Append(Inline(line)).Append("</p>\n");
            }
        }

        if (inList)
        {
            html.Append("</ul>\n");
        }

        return html.ToString();
    }

    /// <summary>
    /// Renders inline markup for one line: <c>**bold**</c> → <c>&lt;strong&gt;</c>,
    /// <c>`code`</c> → <c>&lt;code&gt;</c>; all other text is HTML-escaped.
    /// An opening marker without a matching close is emitted as literal text.
    /// </summary>
    /// <param name="s">Text of a single line, without its block prefix.</param>
    /// <returns>HTML-safe inline markup.</returns>
    private static string Inline(string s)
    {
        var html = new StringBuilder(s.Length + 16);
        int i = 0;

        while (i < s.Length)
        {
            char c = s[i];

            if (c == '*' && i + 1 < s.Length && s[i + 1] == '*')
            {
                int end = s.IndexOf("**", i + 2, StringComparison.Ordinal);
                if (end >= 0)
                {
                    html.Append("<strong>").Append(Esc(s[(i + 2)..end])).Append("</strong>");
                    i = end + 2;
                    continue;
                }
            }
            else if (c == '`')
            {
                int end = s.IndexOf('`', i + 1);
                if (end >= 0)
                {
                    html.Append("<code>").Append(Esc(s[(i + 1)..end])).Append("</code>");
                    i = end + 1;
                    continue;
                }
            }

            // Plain character (or an unmatched marker): escape the HTML-significant ones.
            switch (c)
            {
                case '&': html.Append("&amp;"); break;
                case '<': html.Append("&lt;"); break;
                case '>': html.Append("&gt;"); break;
                default: html.Append(c); break;
            }

            i++;
        }

        return html.ToString();
    }

    /// <summary>
    /// HTML-escapes <c>&amp;</c>, <c>&lt;</c> and <c>&gt;</c>. The ampersand is replaced
    /// first so the entities produced by the later replacements are not escaped again.
    /// </summary>
    private static string Esc(string s) =>
        s.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
}
/// <summary>
/// Default article bodies (Markdown), one constant per built-in post.
/// Each body uses only the constructs that <c>PostModel.SimpleMarkdown</c> recognises:
/// <c>## </c> headings at the start of a line, <c>**bold**</c>, <c>`code`</c>, and one
/// paragraph per line.
/// </summary>
internal static class DefaultBodies
{
    /// <summary>Body of the "rag-eval-framework" post.</summary>
    public const string RagEval = """
        ## Why standard metrics fail for RAG
        BLEU and ROUGE measure n-gram overlap against a reference answer. In a RAG system, there is often no single correct reference — a question about company policy may have dozens of valid phrasings. High BLEU does not mean the system cited the right source; low BLEU does not mean it was wrong.
        ## The three metrics that actually matter
        **Faithfulness** measures whether every claim in the generated answer can be traced back to a retrieved passage. A faithfulness score of 1.0 means the model invented nothing. Tools like RAGAS implement this with an LLM judge.
        **Context Precision** asks: of the passages retrieved, how many were actually relevant to the question? Low precision wastes context window and increases hallucination risk.
        **Answer Relevancy** checks whether the final response actually addresses what was asked — not just whether it sounds good.
        ## Building an eval harness
        Start with a **golden dataset**: 100–200 question/answer pairs that domain experts have verified. Run your pipeline against them nightly. Track the three metrics above over time. A drop in Faithfulness after a model upgrade is a red flag; a drop in Context Precision after a chunking change means your retrieval is degrading.
        The harness does not have to be complex. A spreadsheet with automatic scoring via the OpenAI or Anthropic API is enough to start catching regressions before they reach production.
        """;

    /// <summary>Body of the "agentic-n8n-patterns" post.</summary>
    public const string N8nPatterns = """
        ## The problem with "just use n8n"
        n8n is excellent for integrating SaaS tools. It becomes fragile when you try to use it as an agent orchestrator — long-running loops, conditional retries, and LLM calls that can fail in non-obvious ways.
        ## Separating orchestration from integration
        The pattern that works: **n8n handles triggers and integrations; LangGraph handles agent logic**.
        An n8n workflow watches a Slack channel. When a message matches a pattern, it calls a LangGraph endpoint with the raw payload. LangGraph runs the multi-step reasoning loop, maintains state, and returns a structured result. n8n takes that result and routes it — posts to Jira, sends an email, updates a database row.
        ## Making agents auditable
        Every LangGraph state transition should emit an event to a structured log. We use a Postgres table with columns: `run_id`, `step`, `input`, `output`, `timestamp`. This table becomes the audit trail that compliance teams and on-call engineers both need.
        Add a `human_in_the_loop` node for any action that cannot be undone — deleting records, sending external emails, approving payments. The node pauses execution and posts to Slack; a human approves or rejects; execution resumes.
        ## Handling failures gracefully
        LLM calls fail. Build **retry with exponential backoff** into every LangGraph node that calls an LLM. Set a hard limit of 3 retries, then route to a dead-letter state that pages the on-call engineer. Never silently swallow errors in agentic pipelines — a swallowed error is an invisible outage.
        """;

    /// <summary>Body of the "vertex-cost-control" post.</summary>
    public const string VertexCost = """
        ## Anti-pattern 1: calling Gemini Ultra for everything
        Gemini Ultra (or GPT-4-class models) costs 10–30× more per token than smaller models. Many teams default to the most capable model because it "just works" during prototyping, then never re-evaluate.
        **Fix**: build a **model router**. Classify each incoming request by complexity. Simple lookups, short summaries, and classification tasks go to Gemini Flash or Haiku. Only complex reasoning, multi-step synthesis, and long-context tasks go to Pro or Ultra. In most production systems, 60–80% of requests can be served by the cheaper tier.
        ## Anti-pattern 2: no context caching
        Vertex AI supports prompt caching (as does the Anthropic API). A system prompt that is 10k tokens, sent with every request at $3/M tokens, costs $30 for every million calls before the user has typed a single word.
        **Fix**: cache any context that is static or changes infrequently — system prompts, retrieved document sets, few-shot examples. Cache hits cost ~10% of full input price.
        ## Anti-pattern 3: synchronous batch jobs
        Teams run nightly document processing jobs synchronously — one document at a time, each blocked on the previous. This is slow and expensive because you pay for idle wait time between calls.
        **Fix**: use the Vertex AI batch prediction API for jobs over ~1,000 documents. Batch jobs run asynchronously, are eligible for spot discounts, and typically cost 50% less per token than online serving.
        """;

    /// <summary>Body of the "k8s-llm-inference" post.</summary>
    public const string K8sInference = """
        ## The baseline architecture
        A single Kubernetes `Deployment` behind a `ClusterIP` `Service`, fronted by an Ingress. Works fine up to ~50 RPS for a small model. Falls apart when traffic spikes, when GPU pods take 3 minutes to schedule, or when the model server has a 2-second cold-start.
        ## Autoscaling with KEDA
        HPA (Horizontal Pod Autoscaler) scales on CPU and memory. LLM inference is GPU-bound and queue-depth-bound — neither maps to CPU utilization well.
        KEDA (Kubernetes Event-Driven Autoscaling) scales on arbitrary metrics — queue depth, Pub/Sub lag, Redis list length. We publish inference request counts to a Redis stream; KEDA scales the model server pods when the stream depth exceeds a threshold. Scaling-up latency drops from minutes (cluster autoscaler cold start) to seconds (replica scale-up from 1 to N).
        ## GPU sharing with time-slicing
        For models that fit in 4–8 GB VRAM, full GPU dedication is wasteful. NVIDIA's time-slicing MIG (Multi-Instance GPU) lets multiple pods share one A100, each getting a guaranteed slice.
        Configure `nvidia.com/gpu: 1` and set the time-slice profile to `1g.10gb`. A single A100 80 GB can serve 8 concurrent model instances at 10 GB each — 8× the throughput per GPU.
        ## Request hedging for tail latency
        p50 latency is 12 ms. p99 is 280 ms. The tail is dominated by KV-cache misses and occasional GC pauses. **Hedged requests**: after 40 ms, send a duplicate request to a second replica. Take whichever response arrives first; cancel the other. This cuts p99 from 280 ms to ~45 ms with only ~15% increase in total compute.
        """;

    /// <summary>Body of the "flutter-on-device-ai" post.</summary>
    public const string FlutterAI = """
        ## Why on-device inference matters
        Cloud inference requires a network round-trip, exposes user data to a server, and fails in offline scenarios. For consumer apps — messaging, health, productivity — on-device inference is often a requirement, not a nice-to-have.
        ## Gemini Nano and LiteRT
        Google's Gemini Nano is a 1.8B parameter model quantized to run on mobile NPUs (Neural Processing Units). The Flutter integration uses the `google_ai_dart_sdk` package with `GeminiNanoModel`, falling back to cloud inference when the device model is unavailable.
        LiteRT (formerly TensorFlow Lite) handles vision and custom small models. For classification and embedding tasks, a 50 MB quantized model runs in under 20 ms on a mid-range Android device.
        ## Streaming UX without a network
        The key insight: users tolerate slightly slower responses if they can see text appearing token by token. Even on-device inference can stream — Gemini Nano's Dart SDK exposes a `generateContentStream` method. Pipe tokens directly to a Flutter `StreamBuilder` for a responsive feel regardless of total generation time.
        ## Battery and thermal management
        On-device inference heats the chip. Implement **thermal throttling**: check `DeviceInfo.thermalState` (iOS) or subscribe to the battery API on Android. Reduce `maxTokens` from 512 to 128 during sustained load. Schedule background inference tasks during charging. Users notice neither the throttling nor the scheduling — they notice when their phone gets too hot.
        """;

    /// <summary>Body of the "enterprise-ai-roadmap" post.</summary>
    public const string EnterpriseRoadmap = """
        ## Days 1–30: discovery
        The most expensive mistake in enterprise AI is building the wrong thing fast. Discovery is not a formality — it is the work.
        Interview 8–12 stakeholders across business units. For each, ask: what manual task takes more than 2 hours per week? What decision do you make with incomplete information? What report do you wish existed but is too expensive to build?
        Map the candidates on a 2×2: **impact** (revenue, cost, risk) vs **feasibility** (data quality, integration complexity, regulatory constraints). The top-right quadrant is your first sprint.
        ## Days 31–60: prototype and validate
        Pick one use case from the top-right. Build a prototype in 3 weeks. The prototype does not have to be production-grade — it has to be **testable by domain experts**.
        Run a structured eval: 100 questions, domain expert scores each answer 1–5. Set a threshold (e.g., ≥ 4.0 average) before the sprint begins. If the prototype clears it, proceed to production hardening. If it doesn't, investigate root cause — usually data quality or chunking strategy — before committing engineering resources.
        ## Days 61–90: first production deployment
        Scope the first deployment to a single team of 10–20 people. This limits blast radius and generates real usage data fast.
        Instrument everything: latency, cost per query, thumbs-up/thumbs-down from users, faithfulness score from the automated harness. Review metrics weekly with the business owner. Adjust chunking, retrieval strategy, or model tier based on what the data shows — not intuition.
        At day 90, you have a live system, a tuned eval harness, and a clear picture of what the second use case should be. That is the foundation for a credible 12-month roadmap.
        """;
}