// MES-ETL/MesETL.App/HostedServices/FileInputService.cs

using System.Runtime;
using MesETL.App.Const;
2024-12-10 14:03:09 +08:00
using MesETL.App.HostedServices.Abstractions;
using MesETL.App.Options;
using MesETL.App.Services;
using MesETL.App.Services.ETL;
2024-02-15 16:18:50 +08:00
using Microsoft.Extensions.Configuration;
2024-01-29 09:29:16 +08:00
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace MesETL.App.HostedServices;
2024-01-29 09:29:16 +08:00
/// <summary>
/// Metadata describing a single input file: where it is, which table its rows belong to,
/// and the column headers used to read it.
/// </summary>
/// <remarks>
/// Because this is a record, equality is member-wise; note that <see cref="Headers"/> is an
/// array and is therefore compared by reference, not by content.
/// </remarks>
public record FileInputInfo
{
    /// <summary>
    /// Path of the input file. It is handed to the data reader factory as-is, and only its
    /// file-name part is used in log output.
    /// </summary>
    public required string FileName { get; init; }

    /// <summary>
    /// Name of the table the file's records belong to; used to order, filter and count input.
    /// </summary>
    public required string TableName { get; init; }

    /// <summary>
    /// Database associated with the file.
    /// NOTE(review): not consumed anywhere in this file — presumably the source database name; confirm against the meta builder.
    /// </summary>
    public required string Database { get; init; }

    /// <summary>
    /// Part number of the file.
    /// NOTE(review): not consumed anywhere in this file — presumably the index of a table dump split into several files; confirm.
    /// </summary>
    public required int Part { get; init; }

    /// <summary>
    /// Column headers passed to the data reader when the file is opened.
    /// </summary>
    public required string[] Headers { get; init; }
}
/// <summary>
/// Input service that imports records from the files found in the configured input directory
/// and pushes them onto the producer queue.
/// </summary>
public class FileInputService : IInputService
{
    private readonly ILogger _logger;
    private readonly DataRecordQueue _producerQueue;
    private readonly IOptions<DataInputOptions> _dataInputOptions;
    private readonly ProcessContext _context;
    private readonly DataReaderFactory _dataReaderFactory;

    /// <summary>
    /// Managed-heap size, in bytes, above which reading is paused. Derived from the
    /// "MemoryThreshold" configuration value, which is expressed in GiB (default 8).
    /// </summary>
    private readonly long _memoryThreshold;

    /// <summary>
    /// Creates the service and resolves the memory threshold from configuration.
    /// </summary>
    /// <param name="logger">Logger used for progress and diagnostic messages.</param>
    /// <param name="dataInputOptions">Input options (input directory, meta builder, table order, ignore list, completion callback).</param>
    /// <param name="context">Process context that receives input counters and the input-completed signal.</param>
    /// <param name="producerQueue">Keyed producer queue that read records are enqueued into.</param>
    /// <param name="dataReaderFactory">Factory that creates a reader for each input file.</param>
    /// <param name="configuration">Configuration from which "MemoryThreshold" (GiB) is read.</param>
    public FileInputService(ILogger<FileInputService> logger,
        IOptions<DataInputOptions> dataInputOptions,
        ProcessContext context,
        [FromKeyedServices(Const.ConstVar.Producer)] DataRecordQueue producerQueue,
        DataReaderFactory dataReaderFactory,
        IConfiguration configuration)
    {
        _logger = logger;
        _dataInputOptions = dataInputOptions;
        _context = context;
        _producerQueue = producerQueue;
        _dataReaderFactory = dataReaderFactory;
        // The configured value is in GiB; convert it to bytes once so the hot loop compares plain longs.
        _memoryThreshold = (long)(configuration.GetValue<double>("MemoryThreshold", 8) * 1024 * 1024 * 1024);
    }

    /// <summary>
    /// Reads every usable file in the input directory, in configured table order, and enqueues
    /// each record onto the producer queue.
    /// </summary>
    /// <param name="cancellationToken">
    /// Token checked before each record is enqueued and while waiting during memory back-off.
    /// </param>
    /// <exception cref="ApplicationException">The input directory is not configured.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task ExecuteAsync(CancellationToken cancellationToken)
    {
        var inputDir = _dataInputOptions.Value.InputDir ?? throw new ApplicationException("未配置文件输入目录");
        _logger.LogInformation("***** 输入服务已启动,工作目录为:{InputDir} *****", inputDir);

        var orderedInfo = GetOrderedInputInfo(inputDir);

        foreach (var info in orderedInfo)
        {
            var file = Path.GetFileName(info.FileName);
            _logger.LogInformation("正在读取文件:{FileName}, 对应的数据表:{TableName}", file, info.TableName);

            // Scoped to this iteration: the reader is disposed before the next file is opened.
            using var source = _dataReaderFactory.CreateReader(info.FileName, info.TableName, info.Headers);
            var count = 0;

            while (await source.ReadAsync())
            {
                // Without this check cancellation would only be noticed during the back-off delay below.
                cancellationToken.ThrowIfCancellationRequested();

                if (GC.GetTotalMemory(false) > _memoryThreshold)
                {
                    // Back off once: compact the large object heap, collect, then give consumers
                    // a few seconds to drain. Reading resumes afterwards even if memory is still high.
                    _logger.LogWarning("内存使用率过高,暂缓输入");
                    GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
                    GC.Collect();
                    await Task.Delay(3000, cancellationToken);
                }

                var record = source.Current;
                await _producerQueue.EnqueueAsync(record);
                count++;
                _context.AddInput();
            }

            _context.AddTableInput(info.TableName, count);
            _logger.LogInformation("文件 {File} 输入完成", file);
            // Invoked after every file, so a table split across several files triggers it once per file.
            _dataInputOptions.Value.OnTableInputCompleted?.Invoke(info.TableName);
        }

        _context.CompleteInput();
        _logger.LogInformation("***** 输入服务已执行完毕 *****");
    }

    /// <summary>
    /// Scans <paramref name="dir"/>, maps each file to a <see cref="FileInputInfo"/> through the
    /// configured meta builder, drops files the builder rejects (returns null for), and returns
    /// the remainder in configured table order.
    /// </summary>
    /// <param name="dir">Directory whose files are considered for input.</param>
    /// <returns>The materialized, ordered list of files to read.</returns>
    /// <exception cref="ApplicationException">No file-name-to-table meta builder is configured.</exception>
    public IEnumerable<FileInputInfo> GetOrderedInputInfo(string dir)
    {
        var metaBuilder = _dataInputOptions.Value.FileInputMetaBuilder;
        if (metaBuilder is null)
        {
            throw new ApplicationException("未配置文件名->表名的映射委托函数");
        }

        var files = Directory.GetFiles(dir);
        FileInputInfo[] infoArr = files
            .Select(f => metaBuilder(f))
            .Where(info => info is not null)
            .ToArray()!;

        var orderedInfo = GetFilesInOrder(infoArr).ToArray();
        _logger.LogInformation("***** 输入目录中发现 {Count} 个文件, {InfoCount} 个文件可用,{OrderedCount} 个文件符合当前输入配置 *****",
            files.Length, infoArr.Length, orderedInfo.Length);

        // Grouping and joining file names is only worth doing when the debug output is actually emitted.
        if (_logger.IsEnabled(LogLevel.Debug))
        {
            foreach (var group in orderedInfo.GroupBy(i => i.TableName))
            {
                _logger.LogDebug("表 {TableName} 发现 {FileCount} 个对应文件:\n{FileName}",
                    group.Key, group.Count(), string.Join('\n', group.Select(f => f.FileName)));
            }
        }

        return orderedInfo;
    }

    /// <summary>
    /// Returns the input files arranged by the configured table order, skipping tables on the
    /// ignore list. When no order is configured, the public string fields of
    /// <c>TableNames</c> are used as the order.
    /// </summary>
    /// <param name="inputFiles">Candidate files produced by the meta builder.</param>
    /// <returns>
    /// <paramref name="inputFiles"/> itself when the table order is empty; otherwise a lazily
    /// evaluated sequence of the files whose table appears in the order, grouped by that order.
    /// </returns>
    private IEnumerable<FileInputInfo> GetFilesInOrder(FileInputInfo[] inputFiles)
    {
        var tableOrder = _dataInputOptions.Value.TableOrder
                         ?? typeof(TableNames).GetFields().Select(f => f.GetValue(null) as string).ToArray();
        var ignoreTable = _dataInputOptions.Value.TableIgnoreList;

        // tableOrder cannot be null here (the fallback is a ToArray() result), so only emptiness matters.
        // NOTE(review): on this path the ignore list is not applied, and below the ignore check uses
        // the collection's own Contains while table matching is case-insensitive — confirm both are intended.
        if (tableOrder.Length == 0)
        {
            return inputFiles;
        }

        return Yield();

        IEnumerable<FileInputInfo> Yield()
        {
            foreach (var tableName in tableOrder)
            {
                foreach (var target in inputFiles)
                {
                    if (target.TableName.Equals(tableName, StringComparison.OrdinalIgnoreCase)
                        && !ignoreTable.Contains(target.TableName))
                    {
                        yield return target;
                    }
                }
            }
        }
    }
}