// C# source listing (497 lines, 19 KiB)
using System.Diagnostics;
using System.Globalization;
using System.Text.Json;
using DiunaBI.Domain.Entities;
using DiunaBI.Infrastructure.Data;
using DiunaBI.Infrastructure.Plugins;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.Connectors.Ollama;
using Microsoft.SemanticKernel.Connectors.OpenAI;
|
|
|
|
namespace DiunaBI.Infrastructure.Validators;
|
|
|
|
public class LlmAnomalyValidator : BaseDataValidator
|
|
{
|
|
/// <summary>
/// Validator identifier; passed as the <c>{ValidatorType}</c> argument of this class's log messages.
/// </summary>
public override string ValidatorType => "LlmAnomalyValidator";

private readonly AppDbContext _db;
private readonly IConfiguration _config;
private readonly ILogger<LlmAnomalyValidator> _logger;
private readonly Kernel _kernel;

// Settings read once, in the constructor, from the "AnomalyDetection" configuration section.
private readonly string _provider;
private readonly string _model;
private readonly int _minHistoricalImports;
private readonly int _recentImportsWindow;
private readonly int _monthlyImportsWindow;

// NOTE(review): loaded from configuration but not read anywhere else in the visible code.
private readonly double _confidenceThreshold;

// Per-run state: filled in by LoadConfiguration / ValidateConfiguration during Validate.
private string? SourceLayerName { get; set; }
private Layer? SourceImportWorker { get; set; }

/// <summary>
/// Stores the injected services, reads the <c>AnomalyDetection:*</c> settings (falling back to
/// built-in defaults when a key is absent) and builds the Semantic Kernel instance for the
/// configured provider.
/// </summary>
/// <param name="db">Database context used to read layers and to store validation results.</param>
/// <param name="config">Application configuration holding the <c>AnomalyDetection</c> section.</param>
/// <param name="logger">Logger for diagnostic output.</param>
/// <exception cref="FormatException">A numeric setting is present but is not a valid number.</exception>
/// <exception cref="InvalidOperationException">Required provider credentials are missing.</exception>
/// <exception cref="NotSupportedException">The configured provider is not recognised.</exception>
public LlmAnomalyValidator(
    AppDbContext db,
    IConfiguration config,
    ILogger<LlmAnomalyValidator> logger)
{
    _db = db;
    _config = config;
    _logger = logger;

    _provider = config["AnomalyDetection:Provider"] ?? "OpenAI";
    _model = config["AnomalyDetection:Model"] ?? "gpt-4o-mini";

    // Configuration values are machine-readable text, so they are parsed with the invariant
    // culture. With the current culture, "0.7" fails or is misread on hosts whose decimal
    // separator is a comma.
    _minHistoricalImports = int.Parse(
        config["AnomalyDetection:MinHistoricalImports"] ?? "5", CultureInfo.InvariantCulture);
    _recentImportsWindow = int.Parse(
        config["AnomalyDetection:RecentImportsWindow"] ?? "5", CultureInfo.InvariantCulture);
    _monthlyImportsWindow = int.Parse(
        config["AnomalyDetection:MonthlyImportsWindow"] ?? "5", CultureInfo.InvariantCulture);
    _confidenceThreshold = double.Parse(
        config["AnomalyDetection:ConfidenceThreshold"] ?? "0.7", CultureInfo.InvariantCulture);

    // Must run after _provider/_model/_config are assigned: InitializeKernel reads all three.
    _kernel = InitializeKernel();

    _logger.LogInformation("LlmAnomalyValidator initialized with provider: {Provider}, model: {Model}",
        _provider, _model);
}
|
|
|
|
/// <summary>
/// Builds a Semantic Kernel with a chat-completion service for the configured provider
/// ("openai", "azureopenai" or "ollama", matched case-insensitively).
/// </summary>
/// <returns>The built kernel.</returns>
/// <exception cref="InvalidOperationException">
/// The API key (OpenAI) or the endpoint/API key (Azure OpenAI) is missing from configuration.
/// </exception>
/// <exception cref="NotSupportedException">The provider name is not one of the supported values.</exception>
private Kernel InitializeKernel()
{
    var builder = Kernel.CreateBuilder();

    // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a dotless 'ı' under
    // Turkish cultures, so "OpenAI" would never match "openai" there.
    switch (_provider.ToLowerInvariant())
    {
        case "openai":
        {
            var openAiKey = _config["AnomalyDetection:ApiKey"];
            if (string.IsNullOrEmpty(openAiKey))
            {
                throw new InvalidOperationException("OpenAI API key not configured");
            }

            builder.AddOpenAIChatCompletion(_model, openAiKey);
            break;
        }

        case "azureopenai":
        {
            var azureEndpoint = _config["AnomalyDetection:Endpoint"];
            var azureKey = _config["AnomalyDetection:ApiKey"];
            if (string.IsNullOrEmpty(azureEndpoint) || string.IsNullOrEmpty(azureKey))
            {
                throw new InvalidOperationException("Azure OpenAI endpoint or API key not configured");
            }

            builder.AddAzureOpenAIChatCompletion(_model, azureEndpoint, azureKey);
            break;
        }

        case "ollama":
        {
            // No key is read for Ollama; the endpoint falls back to the local default address.
            var ollamaEndpoint = _config["AnomalyDetection:Endpoint"] ?? "http://localhost:11434";
            builder.AddOllamaChatCompletion(_model, new Uri(ollamaEndpoint));
            break;
        }

        default:
            throw new NotSupportedException($"LLM provider '{_provider}' is not supported");
    }

    return builder.Build();
}
|
|
|
|
/// <summary>
/// Runs one anomaly-detection pass for the given validation worker: loads and checks its
/// configuration, fetches the latest and the historical import layers, and, when enough
/// history exists, performs the LLM validation.
/// </summary>
/// <param name="validationWorker">Layer whose records configure this validation run.</param>
/// <remarks>
/// Returns without validating (after a warning) when fewer historical imports than the
/// configured minimum are found. Any exception is logged and then rethrown unchanged.
/// </remarks>
public override void Validate(Layer validationWorker)
{
    try
    {
        _logger.LogInformation(
            "{ValidatorType}: Starting validation for {ValidationWorkerName} ({ValidationWorkerId})",
            ValidatorType,
            validationWorker.Name,
            validationWorker.Id);

        // Configuration first: the queries below depend on SourceImportWorker being resolved.
        LoadConfiguration(validationWorker);
        ValidateConfiguration();

        var newestImport = GetLatestImportLayer();
        var history = GetHistoricalImports();

        var hasEnoughHistory = history.Count >= _minHistoricalImports;
        if (!hasEnoughHistory)
        {
            _logger.LogWarning(
                "{ValidatorType}: Not enough historical imports: {Count} (need {Min}). Skipping validation.",
                ValidatorType,
                history.Count,
                _minHistoricalImports);
            return;
        }

        PerformValidation(validationWorker, newestImport, history);

        _logger.LogInformation(
            "{ValidatorType}: Successfully completed validation for {ValidationWorkerName}",
            ValidatorType,
            validationWorker.Name);
    }
    catch (Exception failure)
    {
        _logger.LogError(
            failure,
            "{ValidatorType}: Failed to validate {ValidationWorkerName} ({ValidationWorkerId})",
            ValidatorType,
            validationWorker.Name,
            validationWorker.Id);
        throw;
    }
}
|
|
|
|
/// <summary>
/// Reads the "SourceLayer" record of the validation worker into <see cref="SourceLayerName"/>.
/// </summary>
/// <param name="validationWorker">Layer whose records hold the validator configuration.</param>
/// <exception cref="InvalidOperationException">
/// The worker has no records, or the "SourceLayer" value is missing or empty.
/// </exception>
private void LoadConfiguration(Layer validationWorker)
{
    var workerRecords = validationWorker.Records
        ?? throw new InvalidOperationException("ValidationWorker has no records");

    // Name of the ImportWorker administration layer whose imports will be validated.
    SourceLayerName = GetRecordValue(workerRecords, "SourceLayer");

    if (SourceLayerName is not { Length: > 0 })
    {
        throw new InvalidOperationException("SourceLayer record not found");
    }

    _logger.LogDebug(
        "{ValidatorType}: Configuration loaded - SourceLayer: {SourceLayer}",
        ValidatorType,
        SourceLayerName);
}
|
|
|
|
/// <summary>
/// Checks the loaded configuration and resolves <see cref="SourceImportWorker"/>: the single
/// non-deleted, non-cancelled administration layer named <see cref="SourceLayerName"/>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// One or more checks failed (all problems are listed in the message), or the lookup matched
/// more than one layer.
/// </exception>
private void ValidateConfiguration()
{
    var problems = new List<string>();

    if (string.IsNullOrEmpty(SourceLayerName))
    {
        problems.Add("SourceLayer is required");
    }

    // The lookup runs even when the name is missing, so both problems end up in the message.
    SourceImportWorker = _db.Layers
        .Where(x => x.Name == SourceLayerName
                    && x.Type == LayerType.Administration
                    && !x.IsDeleted
                    && !x.IsCancelled)
        .SingleOrDefault();

    if (SourceImportWorker is null)
    {
        problems.Add($"SourceImportWorker layer '{SourceLayerName}' not found");
    }

    if (problems.Count > 0)
    {
        throw new InvalidOperationException($"Configuration validation failed: {string.Join(", ", problems)}");
    }

    _logger.LogDebug("{ValidatorType}: Configuration validation passed", ValidatorType);
}
|
|
|
|
/// <summary>
/// Loads, together with its records, the most recently created import layer that is a child
/// of <see cref="SourceImportWorker"/> and is neither deleted nor cancelled.
/// </summary>
/// <returns>The newest matching import layer.</returns>
/// <exception cref="InvalidOperationException">No matching import layer exists.</exception>
private Layer GetLatestImportLayer()
{
    var newest = _db.Layers
        .Include(x => x.Records)
        .Where(x => x.ParentId == SourceImportWorker!.Id && x.Type == LayerType.Import)
        .Where(x => !x.IsDeleted && !x.IsCancelled)
        .OrderByDescending(x => x.CreatedAt)
        .FirstOrDefault();

    if (newest is null)
    {
        throw new InvalidOperationException(
            $"No import layers found for import worker '{SourceImportWorker!.Name}'");
    }

    _logger.LogDebug(
        "{ValidatorType}: Found latest import layer: {LayerName} ({LayerId})",
        ValidatorType,
        newest.Name,
        newest.Id);

    return newest;
}
|
|
|
|
/// <summary>
/// Loads (untracked, with records) up to <c>_recentImportsWindow</c> import layers of
/// <see cref="SourceImportWorker"/> that precede the newest one, newest first.
/// </summary>
/// <returns>The historical import layers; empty when only one or no import exists.</returns>
/// <remarks>
/// The newest import is skipped because it is the import under validation: it is selected by
/// GetLatestImportLayer with the same filter and ordering. Without the skip the import would be
/// part of its own "historical" baseline and would also count towards the minimum-history check.
/// NOTE(review): this assumes no new import is inserted between the two queries and that
/// CreatedAt values do not tie - confirm this is acceptable for the scheduler in use.
/// </remarks>
private List<Layer> GetHistoricalImports()
{
    var historicalImports = _db.Layers
        .Include(x => x.Records)
        .Where(x => x.ParentId == SourceImportWorker!.Id &&
                    x.Type == LayerType.Import &&
                    !x.IsDeleted &&
                    !x.IsCancelled)
        .OrderByDescending(x => x.CreatedAt)
        .Skip(1) // drop the newest import: it is the one being validated, not history
        .Take(_recentImportsWindow)
        .AsNoTracking()
        .ToList();

    _logger.LogDebug("{ValidatorType}: Found {Count} historical imports for recent window",
        ValidatorType, historicalImports.Count);

    return historicalImports;
}
|
|
|
|
/// <summary>
/// Loads (untracked, with records) up to <c>_monthlyImportsWindow</c> import layers of
/// <see cref="SourceImportWorker"/> whose creation date falls on the first day of a month,
/// newest first.
/// </summary>
/// <returns>The matching import layers; possibly empty.</returns>
/// <remarks>
/// NOTE(review): nothing here excludes the import currently being validated, so it will be
/// part of this baseline when it was itself created on day 1 - confirm whether that is intended.
/// </remarks>
private List<Layer> GetMonthlyBaselineImports()
{
    var firstOfMonthImports = _db.Layers
        .Include(x => x.Records)
        .Where(x => x.ParentId == SourceImportWorker!.Id && x.Type == LayerType.Import)
        .Where(x => x.CreatedAt.Day == 1)
        .Where(x => !x.IsDeleted && !x.IsCancelled)
        .OrderByDescending(x => x.CreatedAt)
        .Take(_monthlyImportsWindow)
        .AsNoTracking()
        .ToList();

    _logger.LogDebug(
        "{ValidatorType}: Found {Count} monthly baseline imports",
        ValidatorType,
        firstOfMonthImports.Count);

    return firstOfMonthImports;
}
|
|
|
|
/// <summary>
/// Validates one import: gathers the monthly baseline, builds the prompt, calls the LLM,
/// and stores the outcome as a new validation layer.
/// </summary>
/// <param name="validationWorker">Validation worker that triggered this run.</param>
/// <param name="latestImport">Import layer to validate.</param>
/// <param name="historicalImports">Recent import layers used as comparison context.</param>
private void PerformValidation(Layer validationWorker, Layer latestImport, List<Layer> historicalImports)
{
    _logger.LogDebug("{ValidatorType}: Performing validation for import: {ImportName}",
        ValidatorType, latestImport.Name);

    // May be empty; BuildPrompt only adds the monthly section when there is data.
    var monthlyBaseline = GetMonthlyBaselineImports();

    var prompt = BuildPrompt(latestImport, historicalImports, monthlyBaseline);

    // Stopwatch is monotonic. Subtracting two DateTime.UtcNow readings can give a wrong or
    // even negative duration when the system clock is adjusted during the call.
    var stopwatch = Stopwatch.StartNew();
    var llmResponse = CallLlm(prompt);
    stopwatch.Stop();
    var processingTime = stopwatch.Elapsed;

    var validationLayer = CreateValidationLayer(validationWorker, latestImport, llmResponse, processingTime);

    SaveValidationLayer(validationLayer, llmResponse);

    _logger.LogInformation("{ValidatorType}: Created validation layer {LayerName} ({LayerId}) in {ProcessingTime}ms",
        ValidatorType, validationLayer.Name, validationLayer.Id, processingTime.TotalMilliseconds);
}
|
|
|
|
/// <summary>
/// Builds the LLM prompt: the current import's records (indented JSON, ordered by code), one
/// compact JSON line per recent import, an optional monthly-baseline section, and the analysis
/// instructions with the expected JSON response shape.
/// </summary>
/// <param name="currentImport">Import layer being validated.</param>
/// <param name="recentImports">Recent import layers used as historical context.</param>
/// <param name="monthlyBaseline">First-of-month import layers; the section is omitted when empty.</param>
/// <returns>The complete prompt text.</returns>
private string BuildPrompt(Layer currentImport, List<Layer> recentImports, List<Layer> monthlyBaseline)
{
    // Dates are formatted with the invariant culture: the prompt is machine-oriented text, and
    // the current culture can substitute a non-Gregorian calendar or another time separator.
    static string FormatDay(Layer layer) =>
        layer.CreatedAt.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);

    // Compact {code, value1} JSON for one layer, ordered by code; "[]" when it has no records.
    static string SerializeCompact(Layer layer) =>
        JsonSerializer.Serialize(
            layer.Records?.OrderBy(r => r.Code).Select(r => new { code = r.Code, value1 = r.Value1 })
            ?? Enumerable.Empty<object>());

    var currentRecords = currentImport.Records?.OrderBy(r => r.Code).ToList() ?? new List<Record>();
    var importType = SourceImportWorker?.Name ?? "Unknown";
    var importDate = currentImport.CreatedAt.ToString("yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture);

    var currentJson = JsonSerializer.Serialize(
        currentRecords.Select(r => new { code = r.Code, value1 = r.Value1 }),
        new JsonSerializerOptions { WriteIndented = true });

    var recentSection = string.Join(
        "\n",
        recentImports.Select((imp, idx) => $"Import {idx + 1} ({FormatDay(imp)}): {SerializeCompact(imp)}"));

    var prompt = $@"You are a data quality analyst specializing in anomaly detection for business intelligence imports.

**Import Type:** {importType}
**Import Date:** {importDate}
**Current Import:** {currentImport.Name}

**Current Import Data ({currentRecords.Count} records):**
{currentJson}

**Historical Context - Last {recentImports.Count} Imports:**
{recentSection}
";

    if (monthlyBaseline.Any())
    {
        var monthlySection = string.Join(
            "\n",
            monthlyBaseline.Select((imp, idx) => $"Monthly Import {idx + 1} ({FormatDay(imp)}): {SerializeCompact(imp)}"));

        prompt += $@"
**Monthly Baseline - Last {monthlyBaseline.Count} First-Day Imports:**
{monthlySection}
";
    }

    prompt += @"
**Analysis Tasks:**
1. **Record-level anomalies:** Identify unusual values for specific codes compared to historical patterns
2. **Structural issues:** Detect missing codes, new codes, or unexpected count changes
3. **Pattern breaks:** Find trend reversals, unexpected correlations, or statistical outliers

**Response Format (JSON):**
```json
{
""overallStatus"": ""pass|warning|critical"",
""recordAnomalies"": [
{
""code"": ""string"",
""value1"": number,
""confidence"": 0.0-1.0,
""severity"": ""low|medium|high|critical"",
""reason"": ""brief explanation"",
""recommendation"": ""suggested action""
}
],
""structuralIssues"": [
{
""issueType"": ""missing_codes|new_codes|count_change"",
""description"": ""string"",
""codes"": [""code1"", ""code2""],
""severity"": ""low|medium|high|critical""
}
],
""summary"": ""Brief overall assessment""
}
```

Analyze the data and respond ONLY with the JSON object. Do not include any markdown formatting or additional text.";

    return prompt;
}
|
|
|
|
// The prompt asks the model for camelCase keys ("overallStatus", "recordAnomalies", ...) while
// the response classes use PascalCase properties. System.Text.Json matches names
// case-sensitively by default, so without this option no property binds and every response
// silently deserializes to the default "pass" result with no anomalies.
private static readonly JsonSerializerOptions LlmResponseJsonOptions = new()
{
    PropertyNameCaseInsensitive = true
};

/// <summary>
/// Removes a surrounding Markdown code fence (<c>```</c> or <c>```json</c> ... <c>```</c>)
/// from <paramref name="text"/>; returns the text unchanged when it is not fenced.
/// </summary>
private static string StripMarkdownFence(string text)
{
    if (!text.StartsWith("```", StringComparison.Ordinal))
    {
        return text;
    }

    var firstLineEnd = text.IndexOf('\n');
    var closingFence = text.LastIndexOf("```", StringComparison.Ordinal);

    // No line break after the opening fence, or no separate closing fence: leave as is and
    // let the JSON parser report the problem.
    if (firstLineEnd < 0 || closingFence <= firstLineEnd)
    {
        return text;
    }

    return text.Substring(firstLineEnd + 1, closingFence - firstLineEnd - 1).Trim();
}

/// <summary>
/// Sends the prompt to the configured chat-completion service and parses the reply into an
/// <see cref="AnomalyResponse"/>.
/// </summary>
/// <param name="prompt">Full prompt text to send as a single user message.</param>
/// <returns>The parsed response; an empty reply is treated as <c>{}</c> (all defaults).</returns>
/// <exception cref="InvalidOperationException">
/// The reply is not valid JSON (the <see cref="JsonException"/> is attached as inner exception)
/// or is the JSON literal <c>null</c>.
/// </exception>
/// <remarks>
/// The async call is awaited by blocking because this method is reached from the synchronous
/// <c>Validate</c> override. Any failure is logged here and rethrown.
/// </remarks>
private AnomalyResponse CallLlm(string prompt)
{
    try
    {
        var chatService = _kernel.GetRequiredService<IChatCompletionService>();

        var chatHistory = new ChatHistory();
        chatHistory.AddUserMessage(prompt);

        var result = chatService.GetChatMessageContentAsync(
            chatHistory,
            new OpenAIPromptExecutionSettings
            {
                Temperature = _config.GetValue<double?>("AnomalyDetection:Temperature") ?? 0.1,
                MaxTokens = _config.GetValue<int?>("AnomalyDetection:MaxTokens") ?? 4000
            }).GetAwaiter().GetResult();

        // The prompt shows the expected format inside a ```json fence, so a model may echo the
        // fence back even though it is told not to; strip it before parsing.
        var jsonResponse = StripMarkdownFence(result.Content?.Trim() ?? "{}");

        try
        {
            return JsonSerializer.Deserialize<AnomalyResponse>(jsonResponse, LlmResponseJsonOptions)
                   ?? throw new InvalidOperationException("LLM returned null response");
        }
        catch (JsonException parseFailure)
        {
            _logger.LogWarning("Failed to parse LLM response as JSON. Raw response: {Response}", jsonResponse);
            throw new InvalidOperationException(
                $"LLM did not return valid JSON. Response: {jsonResponse}", parseFailure);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to call LLM for anomaly detection");
        throw;
    }
}
|
|
|
|
/// <summary>
/// Creates (without saving) a new validation layer whose parent is the validated import layer.
/// </summary>
/// <param name="validationWorker">Not used by this method.</param>
/// <param name="importLayer">Import layer that was validated; becomes the new layer's parent.</param>
/// <param name="response">Not used by this method.</param>
/// <param name="processingTime">Not used by this method.</param>
/// <returns>The new, unsaved validation layer.</returns>
private Layer CreateValidationLayer(Layer validationWorker, Layer importLayer, AnomalyResponse response, TimeSpan processingTime)
{
    // NOTE(review): the number is derived from the current row count, so two concurrent
    // runs can obtain the same value - confirm whether Number/Name must be unique.
    var layerNumber = _db.Layers.Count() + 1;

    // One clock reading for the name and both audit fields, so they cannot disagree.
    var now = DateTime.UtcNow;

    // Invariant culture: the layer name must not change with the host's calendar settings.
    var timestamp = now.ToString("yyyyMMddHHmmss", CultureInfo.InvariantCulture);

    var validationLayer = new Layer
    {
        Id = Guid.NewGuid(),
        Type = LayerType.Validation,
        ParentId = importLayer.Id, // links to the import that was validated
        Number = layerNumber,
        Name = $"L{layerNumber}-V-{timestamp}",
        CreatedById = User.AutoImportUserId,
        ModifiedById = User.AutoImportUserId,
        CreatedAt = now,
        ModifiedAt = now
    };

    _logger.LogDebug("{ValidatorType}: Created validation layer {LayerName}",
        ValidatorType, validationLayer.Name);

    return validationLayer;
}
|
|
|
|
/// <summary>
/// Persists the validation layer together with its result records: summary metadata, one
/// record per anomaly, one record per structural issue, and the full response as JSON.
/// </summary>
/// <param name="validationLayer">New validation layer to add to the database.</param>
/// <param name="response">Parsed LLM response to store.</param>
private void SaveValidationLayer(Layer validationLayer, AnomalyResponse response)
{
    var layerId = validationLayer.Id;
    var anomalyCount = response.RecordAnomalies?.Count ?? 0;
    var structuralIssueCount = response.StructuralIssues?.Count ?? 0;

    // "RecordsChecked" is the number of records in the validated import (the layer's parent),
    // not the number of anomalies, which is stored separately as "AnomaliesDetected".
    var importLayerId = validationLayer.ParentId;
    var recordsChecked = _db.Records.Count(r => r.LayerId == importLayerId);

    // NOTE(review): the configured confidence threshold is not applied here; every anomaly the
    // model reports is stored. Confirm whether low-confidence anomalies should be filtered.

    _db.Layers.Add(validationLayer);

    var records = new List<Record>
    {
        // Invariant culture keeps the stored timestamp format independent of host settings.
        CreateRecord(layerId, "ValidatedAt",
            DateTime.UtcNow.ToString("yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture)),
        CreateRecord(layerId, "OverallStatus", response.OverallStatus),
        CreateRecord(layerId, "RecordsChecked", value1: recordsChecked),
        CreateRecord(layerId, "AnomaliesDetected", value1: anomalyCount),
        CreateRecord(layerId, "StructuralIssuesDetected", value1: structuralIssueCount),
        CreateRecord(layerId, "LlmProvider", _provider),
        CreateRecord(layerId, "LlmModel", _model),
        CreateRecord(layerId, "Summary", response.Summary)
    };

    // One record per anomaly; the confidence is stored as the numeric value.
    if (response.RecordAnomalies != null)
    {
        foreach (var anomaly in response.RecordAnomalies)
        {
            records.Add(CreateRecord(
                layerId,
                $"ANOMALY_{anomaly.Code}",
                $"[{anomaly.Severity}] {anomaly.Reason}. Recommendation: {anomaly.Recommendation}",
                anomaly.Confidence));
        }
    }

    // One record per structural issue.
    if (response.StructuralIssues != null)
    {
        foreach (var issue in response.StructuralIssues)
        {
            var codes = issue.Codes != null ? string.Join(", ", issue.Codes) : "";

            // ToUpperInvariant: record codes are identifiers, and the culture-sensitive
            // ToUpper() turns 'i' into a dotted capital 'İ' under Turkish cultures.
            records.Add(CreateRecord(
                layerId,
                $"STRUCTURAL_{issue.IssueType?.ToUpperInvariant()}",
                $"[{issue.Severity}] {issue.Description}. Codes: {codes}"));
        }
    }

    // Full response kept as JSON for debugging.
    records.Add(CreateRecord(layerId, "LLM_RESPONSE_JSON", JsonSerializer.Serialize(response)));

    _db.Records.AddRange(records);
    _db.SaveChanges();

    _logger.LogDebug("{ValidatorType}: Saved {RecordCount} records for validation layer {LayerId}",
        ValidatorType, records.Count, validationLayer.Id);
}
|
|
|
|
/// <summary>
/// Creates a new, unsaved record for the given layer with a fresh id, the auto-import user as
/// creator and modifier, and the current UTC time as creation and modification time.
/// </summary>
/// <param name="layerId">Id of the layer the record belongs to.</param>
/// <param name="code">Record code.</param>
/// <param name="desc1">Optional text value.</param>
/// <param name="value1">Optional numeric value.</param>
/// <returns>The new record.</returns>
private Record CreateRecord(Guid layerId, string code, string? desc1 = null, double? value1 = null) =>
    new Record
    {
        Id = Guid.NewGuid(),
        LayerId = layerId,
        Code = code,
        Desc1 = desc1,
        Value1 = value1,
        CreatedById = User.AutoImportUserId,
        ModifiedById = User.AutoImportUserId,
        CreatedAt = DateTime.UtcNow,
        ModifiedAt = DateTime.UtcNow
    };
|
|
}
|
|
|
|
// Response models for LLM
|
|
/// <summary>
/// Top-level shape of the JSON the LLM is asked to return.
/// </summary>
public class AnomalyResponse
{
    /// <summary>Overall verdict; defaults to "pass" when the response does not set it.</summary>
    public string OverallStatus { get; set; } = "pass";

    /// <summary>Record-level anomalies, or <c>null</c> when none were supplied.</summary>
    public List<RecordAnomaly>? RecordAnomalies { get; set; }

    /// <summary>Structural issues, or <c>null</c> when none were supplied.</summary>
    public List<StructuralIssue>? StructuralIssues { get; set; }

    /// <summary>Free-text overall assessment; empty by default.</summary>
    public string Summary { get; set; } = string.Empty;
}
|
|
|
|
/// <summary>
/// One record-level anomaly reported by the LLM.
/// </summary>
public class RecordAnomaly
{
    /// <summary>Code of the affected record; empty by default.</summary>
    public string Code { get; set; } = string.Empty;

    /// <summary>Value reported for the record, if any.</summary>
    public double? Value1 { get; set; }

    /// <summary>Confidence reported by the model (the prompt asks for 0.0-1.0).</summary>
    public double Confidence { get; set; }

    /// <summary>Severity label; defaults to "low".</summary>
    public string Severity { get; set; } = "low";

    /// <summary>Short explanation of the anomaly; empty by default.</summary>
    public string Reason { get; set; } = string.Empty;

    /// <summary>Suggested follow-up action; empty by default.</summary>
    public string Recommendation { get; set; } = string.Empty;
}
|
|
|
|
/// <summary>
/// One structural issue (for example missing or new codes) reported by the LLM.
/// </summary>
public class StructuralIssue
{
    /// <summary>Issue category as returned by the model, or <c>null</c> when absent.</summary>
    public string? IssueType { get; set; }

    /// <summary>Description of the issue; empty by default.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Codes the issue refers to, or <c>null</c> when none were supplied.</summary>
    public List<string>? Codes { get; set; }

    /// <summary>Severity label; defaults to "low".</summary>
    public string Severity { get; set; } = "low";
}
|