Files
DiunaBI/DiunaBI.Infrastructure/Validators/LlmAnomalyValidator.cs
2025-12-15 20:05:26 +01:00

497 lines
19 KiB
C#

using System.Text.Json;
using DiunaBI.Domain.Entities;
using DiunaBI.Infrastructure.Data;
using DiunaBI.Infrastructure.Plugins;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using Microsoft.SemanticKernel.Connectors.Ollama;
namespace DiunaBI.Infrastructure.Validators;
/// <summary>
/// Validator that sends the latest import layer of a configured import worker, together with
/// earlier imports of the same worker, to an LLM through Semantic Kernel and stores the LLM's
/// anomaly report as a new validation layer with records.
/// </summary>
public class LlmAnomalyValidator : BaseDataValidator
{
    /// <summary>Options used for the indented "current import" JSON section of the prompt.</summary>
    private static readonly JsonSerializerOptions IndentedJsonOptions = new() { WriteIndented = true };

    /// <summary>
    /// Options used to read the LLM answer. The prompt asks for camelCase keys while the response
    /// models use PascalCase properties, so matching must be case-insensitive or nothing binds.
    /// </summary>
    private static readonly JsonSerializerOptions ResponseJsonOptions = new() { PropertyNameCaseInsensitive = true };

    public override string ValidatorType => "LlmAnomalyValidator";

    private readonly AppDbContext _db;
    private readonly IConfiguration _config;
    private readonly ILogger<LlmAnomalyValidator> _logger;
    private readonly Kernel _kernel;

    // Configuration loaded from appsettings.json
    private readonly string _provider;
    private readonly string _model;
    private readonly int _minHistoricalImports;
    private readonly int _recentImportsWindow;
    private readonly int _monthlyImportsWindow;

    // NOTE(review): loaded from configuration but not read anywhere else in this class;
    // confirm whether anomalies below this confidence were meant to be filtered out.
    private readonly double _confidenceThreshold;

    // Configuration loaded from ValidationWorker records
    private string? SourceLayerName { get; set; }
    private Layer? SourceImportWorker { get; set; }

    /// <summary>
    /// Reads the "AnomalyDetection:*" settings and builds the Semantic Kernel instance.
    /// </summary>
    /// <exception cref="InvalidOperationException">A required API key or endpoint is not configured.</exception>
    /// <exception cref="NotSupportedException">The configured provider is not one of the supported ones.</exception>
    /// <exception cref="FormatException">A numeric setting cannot be parsed.</exception>
    public LlmAnomalyValidator(
        AppDbContext db,
        IConfiguration config,
        ILogger<LlmAnomalyValidator> logger)
    {
        _db = db;
        _config = config;
        _logger = logger;

        // Settings files are machine-readable: always parse with the invariant culture so that
        // "0.7" means 0.7 even when the process runs under a comma-decimal culture.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        _provider = config["AnomalyDetection:Provider"] ?? "OpenAI";
        _model = config["AnomalyDetection:Model"] ?? "gpt-4o-mini";
        _minHistoricalImports = int.Parse(config["AnomalyDetection:MinHistoricalImports"] ?? "5", invariant);
        _recentImportsWindow = int.Parse(config["AnomalyDetection:RecentImportsWindow"] ?? "5", invariant);
        _monthlyImportsWindow = int.Parse(config["AnomalyDetection:MonthlyImportsWindow"] ?? "5", invariant);
        _confidenceThreshold = double.Parse(config["AnomalyDetection:ConfidenceThreshold"] ?? "0.7", invariant);

        // Initialize Semantic Kernel based on provider
        _kernel = InitializeKernel();

        _logger.LogInformation("LlmAnomalyValidator initialized with provider: {Provider}, model: {Model}",
            _provider, _model);
    }

    /// <summary>
    /// Builds a kernel with a chat-completion service for the configured provider
    /// ("openai", "azureopenai" or "ollama", matched case-insensitively).
    /// </summary>
    private Kernel InitializeKernel()
    {
        var builder = Kernel.CreateBuilder();

        // Invariant lower-casing: the provider name is an identifier, not user-facing text.
        switch (_provider.ToLowerInvariant())
        {
            case "openai":
                var openAiKey = _config["AnomalyDetection:ApiKey"];
                if (string.IsNullOrEmpty(openAiKey))
                {
                    throw new InvalidOperationException("OpenAI API key not configured");
                }

                builder.AddOpenAIChatCompletion(_model, openAiKey);
                break;

            case "azureopenai":
                var azureEndpoint = _config["AnomalyDetection:Endpoint"];
                var azureKey = _config["AnomalyDetection:ApiKey"];
                if (string.IsNullOrEmpty(azureEndpoint) || string.IsNullOrEmpty(azureKey))
                {
                    throw new InvalidOperationException("Azure OpenAI endpoint or API key not configured");
                }

                builder.AddAzureOpenAIChatCompletion(_model, azureEndpoint, azureKey);
                break;

            case "ollama":
                var ollamaEndpoint = _config["AnomalyDetection:Endpoint"] ?? "http://localhost:11434";
                builder.AddOllamaChatCompletion(_model, new Uri(ollamaEndpoint));
                break;

            default:
                throw new NotSupportedException($"LLM provider '{_provider}' is not supported");
        }

        return builder.Build();
    }

    /// <summary>
    /// Runs the anomaly validation described by <paramref name="validationWorker"/>: loads its
    /// configuration records, finds the latest import of the source worker and, when enough
    /// earlier imports exist, asks the LLM for an assessment and saves it as a validation layer.
    /// </summary>
    /// <param name="validationWorker">Layer whose records carry the validator configuration ("SourceLayer").</param>
    /// <remarks>Any exception is logged and rethrown.</remarks>
    public override void Validate(Layer validationWorker)
    {
        try
        {
            _logger.LogInformation("{ValidatorType}: Starting validation for {ValidationWorkerName} ({ValidationWorkerId})",
                ValidatorType, validationWorker.Name, validationWorker.Id);

            LoadConfiguration(validationWorker);
            ValidateConfiguration();

            var latestImport = GetLatestImportLayer();

            // History must not contain the import that is being validated.
            var historicalImports = GetHistoricalImports(latestImport.Id);

            if (historicalImports.Count < _minHistoricalImports)
            {
                _logger.LogWarning("{ValidatorType}: Not enough historical imports: {Count} (need {Min}). Skipping validation.",
                    ValidatorType, historicalImports.Count, _minHistoricalImports);
                return;
            }

            PerformValidation(validationWorker, latestImport, historicalImports);

            _logger.LogInformation("{ValidatorType}: Successfully completed validation for {ValidationWorkerName}",
                ValidatorType, validationWorker.Name);
        }
        catch (Exception e)
        {
            _logger.LogError(e, "{ValidatorType}: Failed to validate {ValidationWorkerName} ({ValidationWorkerId})",
                ValidatorType, validationWorker.Name, validationWorker.Id);
            throw;
        }
    }

    /// <summary>Reads the "SourceLayer" record of the validation worker into <see cref="SourceLayerName"/>.</summary>
    /// <exception cref="InvalidOperationException">The worker has no records or no "SourceLayer" value.</exception>
    private void LoadConfiguration(Layer validationWorker)
    {
        if (validationWorker.Records == null)
        {
            throw new InvalidOperationException("ValidationWorker has no records");
        }

        // Load source layer name (ImportWorker Administration Layer)
        SourceLayerName = GetRecordValue(validationWorker.Records, "SourceLayer");
        if (string.IsNullOrEmpty(SourceLayerName))
        {
            throw new InvalidOperationException("SourceLayer record not found");
        }

        _logger.LogDebug("{ValidatorType}: Configuration loaded - SourceLayer: {SourceLayer}",
            ValidatorType, SourceLayerName);
    }

    /// <summary>
    /// Resolves <see cref="SourceImportWorker"/> from <see cref="SourceLayerName"/> and throws one
    /// exception listing every configuration problem found.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The configuration is incomplete, the source layer does not exist, or (raised by
    /// <c>SingleOrDefault</c>) more than one layer matches.
    /// </exception>
    private void ValidateConfiguration()
    {
        var errors = new List<string>();

        if (string.IsNullOrEmpty(SourceLayerName))
        {
            errors.Add("SourceLayer is required");
        }

        // Find source import worker (Administration Layer)
        SourceImportWorker = _db.Layers
            .SingleOrDefault(x => x.Name == SourceLayerName &&
                                  x.Type == LayerType.Administration &&
                                  !x.IsDeleted &&
                                  !x.IsCancelled);

        if (SourceImportWorker == null)
        {
            errors.Add($"SourceImportWorker layer '{SourceLayerName}' not found");
        }

        if (errors.Count > 0)
        {
            throw new InvalidOperationException($"Configuration validation failed: {string.Join(", ", errors)}");
        }

        _logger.LogDebug("{ValidatorType}: Configuration validation passed", ValidatorType);
    }

    /// <summary>
    /// Base query shared by all import look-ups: non-deleted, non-cancelled import layers whose
    /// parent is <see cref="SourceImportWorker"/>, with their records included.
    /// </summary>
    private IQueryable<Layer> QueryImportLayers()
    {
        var sourceWorkerId = SourceImportWorker!.Id;

        return _db.Layers
            .Include(x => x.Records)
            .Where(x => x.ParentId == sourceWorkerId &&
                        x.Type == LayerType.Import &&
                        !x.IsDeleted &&
                        !x.IsCancelled);
    }

    /// <summary>Returns the most recently created import layer of the source worker.</summary>
    /// <exception cref="InvalidOperationException">The source worker has no import layers.</exception>
    private Layer GetLatestImportLayer()
    {
        var latestImport = QueryImportLayers()
            .OrderByDescending(x => x.CreatedAt)
            .FirstOrDefault();

        if (latestImport == null)
        {
            throw new InvalidOperationException(
                $"No import layers found for import worker '{SourceImportWorker!.Name}'");
        }

        _logger.LogDebug("{ValidatorType}: Found latest import layer: {LayerName} ({LayerId})",
            ValidatorType, latestImport.Name, latestImport.Id);

        return latestImport;
    }

    /// <summary>
    /// Returns up to <c>RecentImportsWindow</c> most recent import layers, newest first,
    /// excluding the import that is being validated.
    /// </summary>
    /// <param name="currentImportId">Id of the import under validation; it is left out of the result.</param>
    private List<Layer> GetHistoricalImports(Guid currentImportId)
    {
        var historicalImports = QueryImportLayers()
            .Where(x => x.Id != currentImportId)
            .OrderByDescending(x => x.CreatedAt)
            .Take(_recentImportsWindow)
            .AsNoTracking()
            .ToList();

        _logger.LogDebug("{ValidatorType}: Found {Count} historical imports for recent window",
            ValidatorType, historicalImports.Count);

        return historicalImports;
    }

    /// <summary>
    /// Returns up to <c>MonthlyImportsWindow</c> import layers created on the first day of a month,
    /// newest first, excluding the import that is being validated.
    /// </summary>
    /// <param name="currentImportId">Id of the import under validation; it is left out of the result.</param>
    private List<Layer> GetMonthlyBaselineImports(Guid currentImportId)
    {
        var monthlyImports = QueryImportLayers()
            .Where(x => x.Id != currentImportId && x.CreatedAt.Day == 1)
            .OrderByDescending(x => x.CreatedAt)
            .Take(_monthlyImportsWindow)
            .AsNoTracking()
            .ToList();

        _logger.LogDebug("{ValidatorType}: Found {Count} monthly baseline imports",
            ValidatorType, monthlyImports.Count);

        return monthlyImports;
    }

    /// <summary>
    /// Builds the prompt, calls the LLM, and persists the result as a validation layer
    /// attached to <paramref name="latestImport"/>.
    /// </summary>
    private void PerformValidation(Layer validationWorker, Layer latestImport, List<Layer> historicalImports)
    {
        _logger.LogDebug("{ValidatorType}: Performing validation for import: {ImportName}",
            ValidatorType, latestImport.Name);

        var monthlyBaseline = GetMonthlyBaselineImports(latestImport.Id);
        var prompt = BuildPrompt(latestImport, historicalImports, monthlyBaseline);

        // Stopwatch is monotonic; subtracting two DateTime.UtcNow values is not.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        var llmResponse = CallLlm(prompt);
        var processingTime = stopwatch.Elapsed;

        var validationLayer = CreateValidationLayer(latestImport);
        var recordsChecked = latestImport.Records?.Count() ?? 0;
        SaveValidationLayer(validationLayer, llmResponse, recordsChecked);

        _logger.LogInformation("{ValidatorType}: Created validation layer {LayerName} ({LayerId}) in {ProcessingTime}ms",
            ValidatorType, validationLayer.Name, validationLayer.Id, processingTime.TotalMilliseconds);
    }

    /// <summary>Serializes a layer's records (ordered by code) as a compact JSON array of code/value1 pairs.</summary>
    private static string SerializeRecordsCompact(Layer layer) =>
        JsonSerializer.Serialize(
            layer.Records?.OrderBy(r => r.Code).Select(r => new { code = r.Code, value1 = r.Value1 })
            ?? Enumerable.Empty<object>());

    /// <summary>
    /// Composes the LLM prompt: the current import's records, the recent imports, the optional
    /// monthly baseline, and the analysis tasks with the expected JSON response format.
    /// </summary>
    private string BuildPrompt(Layer currentImport, List<Layer> recentImports, List<Layer> monthlyBaseline)
    {
        var currentRecords = currentImport.Records?.OrderBy(r => r.Code).ToList() ?? new List<Record>();
        var importType = SourceImportWorker?.Name ?? "Unknown";

        var currentJson = JsonSerializer.Serialize(
            currentRecords.Select(r => new { code = r.Code, value1 = r.Value1 }),
            IndentedJsonOptions);

        var recentSection = string.Join(
            "\n",
            recentImports.Select((imp, idx) =>
                $"Import {idx + 1} ({imp.CreatedAt:yyyy-MM-dd}): {SerializeRecordsCompact(imp)}"));

        // The lines of the verbatim strings below start at column 0 on purpose:
        // any leading whitespace would become part of the prompt text.
        var prompt = $@"You are a data quality analyst specializing in anomaly detection for business intelligence imports.
**Import Type:** {importType}
**Import Date:** {currentImport.CreatedAt:yyyy-MM-dd HH:mm:ss}
**Current Import:** {currentImport.Name}
**Current Import Data ({currentRecords.Count} records):**
{currentJson}
**Historical Context - Last {recentImports.Count} Imports:**
{recentSection}
";

        if (monthlyBaseline.Any())
        {
            var monthlySection = string.Join(
                "\n",
                monthlyBaseline.Select((imp, idx) =>
                    $"Monthly Import {idx + 1} ({imp.CreatedAt:yyyy-MM-dd}): {SerializeRecordsCompact(imp)}"));

            prompt += $@"
**Monthly Baseline - Last {monthlyBaseline.Count} First-Day Imports:**
{monthlySection}
";
        }

        prompt += @"
**Analysis Tasks:**
1. **Record-level anomalies:** Identify unusual values for specific codes compared to historical patterns
2. **Structural issues:** Detect missing codes, new codes, or unexpected count changes
3. **Pattern breaks:** Find trend reversals, unexpected correlations, or statistical outliers
**Response Format (JSON):**
```json
{
""overallStatus"": ""pass|warning|critical"",
""recordAnomalies"": [
{
""code"": ""string"",
""value1"": number,
""confidence"": 0.0-1.0,
""severity"": ""low|medium|high|critical"",
""reason"": ""brief explanation"",
""recommendation"": ""suggested action""
}
],
""structuralIssues"": [
{
""issueType"": ""missing_codes|new_codes|count_change"",
""description"": ""string"",
""codes"": [""code1"", ""code2""],
""severity"": ""low|medium|high|critical""
}
],
""summary"": ""Brief overall assessment""
}
```
Analyze the data and respond ONLY with the JSON object. Do not include any markdown formatting or additional text.";

        return prompt;
    }

    /// <summary>
    /// Removes a surrounding Markdown code fence (``` or ```json ... ```) from an LLM answer, if
    /// present, and trims whitespace. Text without a leading fence is only trimmed.
    /// </summary>
    private static string StripMarkdownCodeFence(string text)
    {
        var trimmed = text.Trim();
        if (!trimmed.StartsWith("```", StringComparison.Ordinal))
        {
            return trimmed;
        }

        // Drop the opening fence line, including an optional language tag such as "json".
        var firstLineEnd = trimmed.IndexOf('\n');
        if (firstLineEnd < 0)
        {
            return trimmed;
        }

        var body = trimmed[(firstLineEnd + 1)..];

        var closingFence = body.LastIndexOf("```", StringComparison.Ordinal);
        if (closingFence >= 0)
        {
            body = body[..closingFence];
        }

        return body.Trim();
    }

    /// <summary>
    /// Sends <paramref name="prompt"/> as a single user message to the chat-completion service
    /// and parses the answer into an <see cref="AnomalyResponse"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The answer is not valid JSON (the <see cref="JsonException"/> is kept as inner exception)
    /// or deserializes to null.
    /// </exception>
    /// <remarks>
    /// The async Semantic Kernel call is awaited synchronously because <see cref="Validate"/> is a
    /// synchronous override. Failures are logged and rethrown.
    /// </remarks>
    private AnomalyResponse CallLlm(string prompt)
    {
        try
        {
            var chatService = _kernel.GetRequiredService<IChatCompletionService>();

            var chatHistory = new ChatHistory();
            chatHistory.AddUserMessage(prompt);

            var executionSettings = new OpenAIPromptExecutionSettings
            {
                Temperature = _config.GetValue<double?>("AnomalyDetection:Temperature") ?? 0.1,
                MaxTokens = _config.GetValue<int?>("AnomalyDetection:MaxTokens") ?? 4000
            };

            var result = chatService
                .GetChatMessageContentAsync(chatHistory, executionSettings)
                .GetAwaiter()
                .GetResult();

            // Models frequently wrap the JSON in a ```json fence despite the instruction not to.
            var jsonResponse = StripMarkdownCodeFence(result.Content ?? "{}");

            try
            {
                return JsonSerializer.Deserialize<AnomalyResponse>(jsonResponse, ResponseJsonOptions)
                       ?? throw new InvalidOperationException("LLM returned null response");
            }
            catch (JsonException jsonEx)
            {
                _logger.LogWarning("Failed to parse LLM response as JSON. Raw response: {Response}", jsonResponse);
                throw new InvalidOperationException($"LLM did not return valid JSON. Response: {jsonResponse}", jsonEx);
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to call LLM for anomaly detection");
            throw;
        }
    }

    /// <summary>
    /// Creates (without saving) a validation layer whose parent is the validated import.
    /// The layer number is the current number of layers in the database plus one.
    /// </summary>
    /// <param name="importLayer">The import layer that was validated.</param>
    private Layer CreateValidationLayer(Layer importLayer)
    {
        var layerNumber = _db.Layers.Count() + 1;
        var now = DateTime.UtcNow;
        var timestamp = now.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

        var validationLayer = new Layer
        {
            Id = Guid.NewGuid(),
            Type = LayerType.Validation,
            ParentId = importLayer.Id, // Links to the import that was validated
            Number = layerNumber,
            Name = $"L{layerNumber}-V-{timestamp}",
            CreatedById = User.AutoImportUserId,
            ModifiedById = User.AutoImportUserId,
            CreatedAt = now,
            ModifiedAt = now
        };

        _logger.LogDebug("{ValidatorType}: Created validation layer {LayerName}",
            ValidatorType, validationLayer.Name);

        return validationLayer;
    }

    /// <summary>
    /// Adds the validation layer and its records (metadata, one record per anomaly and per
    /// structural issue, and the full response as JSON) to the context and saves them.
    /// </summary>
    /// <param name="validationLayer">The new validation layer to persist.</param>
    /// <param name="response">Parsed LLM answer.</param>
    /// <param name="recordsChecked">Number of records in the import that was sent to the LLM.</param>
    private void SaveValidationLayer(Layer validationLayer, AnomalyResponse response, int recordsChecked)
    {
        _db.Layers.Add(validationLayer);

        var layerId = validationLayer.Id;
        var anomalyCount = response.RecordAnomalies?.Count ?? 0;
        var structuralIssueCount = response.StructuralIssues?.Count ?? 0;

        // Metadata records
        var records = new List<Record>
        {
            CreateRecord(layerId, "ValidatedAt", DateTime.UtcNow.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture)),
            CreateRecord(layerId, "OverallStatus", response.OverallStatus),
            CreateRecord(layerId, "RecordsChecked", value1: recordsChecked),
            CreateRecord(layerId, "AnomaliesDetected", value1: anomalyCount),
            CreateRecord(layerId, "StructuralIssuesDetected", value1: structuralIssueCount),
            CreateRecord(layerId, "LlmProvider", _provider),
            CreateRecord(layerId, "LlmModel", _model),
            CreateRecord(layerId, "Summary", response.Summary)
        };

        // Individual anomaly records
        if (response.RecordAnomalies != null)
        {
            foreach (var anomaly in response.RecordAnomalies)
            {
                records.Add(CreateRecord(
                    layerId,
                    $"ANOMALY_{anomaly.Code}",
                    $"[{anomaly.Severity}] {anomaly.Reason}. Recommendation: {anomaly.Recommendation}",
                    anomaly.Confidence));
            }
        }

        // Structural issue records
        if (response.StructuralIssues != null)
        {
            foreach (var issue in response.StructuralIssues)
            {
                var codes = issue.Codes != null ? string.Join(", ", issue.Codes) : "";
                records.Add(CreateRecord(
                    layerId,
                    $"STRUCTURAL_{issue.IssueType?.ToUpperInvariant()}",
                    $"[{issue.Severity}] {issue.Description}. Codes: {codes}"));
            }
        }

        // Store full LLM response as JSON (for debugging)
        records.Add(CreateRecord(layerId, "LLM_RESPONSE_JSON", JsonSerializer.Serialize(response)));

        _db.Records.AddRange(records);
        _db.SaveChanges();

        _logger.LogDebug("{ValidatorType}: Saved {RecordCount} records for validation layer {LayerId}",
            ValidatorType, records.Count, validationLayer.Id);
    }

    /// <summary>Creates a record for the given layer, stamped with the auto-import user and the current UTC time.</summary>
    private Record CreateRecord(Guid layerId, string code, string? desc1 = null, double? value1 = null)
    {
        var now = DateTime.UtcNow;

        return new Record
        {
            Id = Guid.NewGuid(),
            LayerId = layerId,
            Code = code,
            Desc1 = desc1,
            Value1 = value1,
            CreatedById = User.AutoImportUserId,
            ModifiedById = User.AutoImportUserId,
            CreatedAt = now,
            ModifiedAt = now
        };
    }
}
// Response models for LLM
/// <summary>
/// Root object of the JSON answer the LLM is asked to produce.
/// </summary>
public class AnomalyResponse
{
    /// <summary>Overall verdict; stays "pass" when the answer does not set it.</summary>
    public string OverallStatus { get; set; } = "pass";

    /// <summary>Per-record anomalies; null when the answer does not contain the list.</summary>
    public List<RecordAnomaly>? RecordAnomalies { get; set; }

    /// <summary>Structural findings; null when the answer does not contain the list.</summary>
    public List<StructuralIssue>? StructuralIssues { get; set; }

    /// <summary>Free-text overall assessment; empty by default.</summary>
    public string Summary { get; set; } = string.Empty;
}
/// <summary>
/// One anomaly reported by the LLM for a single record code.
/// </summary>
public class RecordAnomaly
{
    /// <summary>Code of the affected record; empty by default.</summary>
    public string Code { get; set; } = string.Empty;

    /// <summary>Value reported for the record, if any.</summary>
    public double? Value1 { get; set; }

    /// <summary>Confidence reported by the LLM; 0 by default.</summary>
    public double Confidence { get; set; }

    /// <summary>Severity label; "low" by default.</summary>
    public string Severity { get; set; } = "low";

    /// <summary>Short explanation of the anomaly; empty by default.</summary>
    public string Reason { get; set; } = string.Empty;

    /// <summary>Suggested follow-up action; empty by default.</summary>
    public string Recommendation { get; set; } = string.Empty;
}
/// <summary>
/// One structural finding reported by the LLM (the prompt in this file asks for
/// missing_codes, new_codes or count_change as issue types).
/// </summary>
public class StructuralIssue
{
    /// <summary>Kind of issue; null when the answer does not set it.</summary>
    public string? IssueType { get; set; }

    /// <summary>Description of the issue; empty by default.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Record codes involved; null when the answer does not contain the list.</summary>
    public List<string>? Codes { get; set; }

    /// <summary>Severity label; "low" by default.</summary>
    public string Severity { get; set; } = "low";
}