2024-01-29 09:29:16 +08:00
parent 4f96b77e55
commit 083090c62b
63 changed files with 2479 additions and 1491 deletions

View File

@@ -0,0 +1,106 @@
using System.Text;
using ConsoleApp2.HostedServices.Abstractions;
using Microsoft.Extensions.Logging;
namespace ConsoleApp2.Services.ETL;
/// <summary>
/// Reads CSV records from a file or stream.
/// </summary>
public class CsvReader : IDataReader
{
protected readonly string? FilePath;
protected readonly Lazy<StreamReader> Reader;
protected readonly ILogger? Logger;
protected readonly string TableName;
public DataRecord Current { get; protected set; } = null!;
public string[] Headers { get; }
public string? CurrentRaw { get; protected set; }
public string Delimiter { get; }
public char QuoteChar { get; }
public CsvReader(Stream stream, string tableName, string[] headers, string delimiter = ",", char quoteChar = '"', ILogger? logger = null)
: this(tableName, headers, delimiter, quoteChar, logger)
{
Reader = new Lazy<StreamReader>(() => new StreamReader(stream));
}
public CsvReader(string filePath, string tableName, string[] headers, string delimiter = ",", char quoteChar = '"', ILogger? logger = null)
: this(tableName, headers, delimiter, quoteChar, logger)
{
FilePath = filePath;
// Open the file lazily: a derived reader that hides Reader (e.g. ZstReader) would otherwise leak this handle, since it never reads from it.
Reader = new Lazy<StreamReader>(() => new StreamReader(File.OpenRead(filePath)));
}
private CsvReader(string tableName, string[] headers, string delimiter = ",", char quoteChar = '"', ILogger? logger = null)
{
TableName = tableName;
Headers = headers;
Logger = logger;
Delimiter = delimiter;
QuoteChar = quoteChar;
Reader = null!; // always assigned by the public constructors
}
/// <summary>
/// Reads and parses the next line; returns false at the end of the stream or on a blank line.
/// </summary>
public virtual async ValueTask<bool> ReadAsync()
{
var str = await Reader.Value.ReadLineAsync();
if (string.IsNullOrWhiteSpace(str))
return false;
CurrentRaw = str;
var fields = ParseRow(str, QuoteChar, Delimiter);
Current = new DataRecord(fields, TableName, Headers) { RawField = str };
return true;
}
/// <summary>
/// Splits one CSV row into fields, honouring the quote character and backslash escapes.
/// Only the first character of <paramref name="delimiter"/> is used as the separator.
/// </summary>
public string[] ParseRow(ReadOnlySpan<char> source, char quoteChar, string delimiter)
{
var result = new List<string>();
var index = -1;
var current = new StringBuilder();
var hasQuote = false; // true while inside a quoted section
var hasSlash = false; // true when the previous character was a backslash escape
while (index < source.Length - 1)
{
index++;
// an unescaped backslash starts an escape sequence; it is kept in the field value
if (hasSlash == false && source[index] == '\\')
{
hasSlash = true;
current.Append('\\');
continue;
}
// an unescaped quote toggles the quoted state and is kept in the field value
if (hasSlash == false && source[index] == quoteChar)
{
hasQuote = !hasQuote;
current.Append(source[index]);
continue;
}
// a delimiter outside quotes ends the current field
if (hasQuote == false && source[index] == delimiter[0])
{
result.Add(current.ToString());
current.Clear();
}
else
{
current.Append(source[index]);
}
hasSlash = false;
}
result.Add(current.ToString());
return result.ToArray();
}
public virtual void Dispose()
{
if(Reader.IsValueCreated)
Reader.Value.Dispose();
}
}
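
For orientation, a minimal usage sketch of the reader above (not part of the commit; the table name, headers and sample rows are made up, and the stream-based constructor is used so nothing touches the file system):

using System.Text;
using ConsoleApp2.Services.ETL;

var bytes = Encoding.UTF8.GetBytes("1,Alice,500\n2,Bob,1500\n");
var reader = new CsvReader(new MemoryStream(bytes), "user", new[] { "ID", "Name", "CompanyID" });
while (await reader.ReadAsync())
{
    // Current holds the parsed DataRecord; CurrentRaw keeps the unparsed line for diagnostics
    Console.WriteLine($"{reader.Current.TableName}: {reader.CurrentRaw}");
}
reader.Dispose();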

View File

@@ -0,0 +1,46 @@
using ConsoleApp2.HostedServices.Abstractions;
using ConsoleApp2.Options;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ConsoleApp2.Services.ETL;
public class DataReaderFactory
{
private readonly ILogger<DataReaderFactory> _logger;
private readonly IOptions<DataInputOptions> _options;
public DataReaderFactory(ILogger<DataReaderFactory> logger, IOptions<DataInputOptions> options)
{
_logger = logger;
_options = options;
}
public IDataReader CreateReader(string filePath, string tableName, string[] headers)
{
if (_options.Value.UseMock)
{
if (_options.Value.TableMockConfig is null)
throw new ApplicationException("未配置表模拟数据量级");
_logger.LogDebug("***** Using {Type} data source *****", "ZSTD mock");
var mockConfig = _options.Value.TableMockConfig.GetValueOrDefault(tableName,
new TableMockConfig { MockCount = 1, UseDeepCopy = false });
mockConfig.MockCount = (long)Math.Ceiling(mockConfig.MockCount * _options.Value.MockCountMultiplier);
return new ZstMockReader(mockConfig, filePath,
tableName, headers, _options.Value.Delimiter, _options.Value.QuoteChar, _logger);
}
_logger.LogDebug("***** Using {Type} data source *****", "ZSTD");
return new ZstReader(filePath, tableName, headers, _options.Value.Delimiter, _options.Value.QuoteChar, _logger);
}
}
public static class DataSourceFactoryExtensions
{
public static IServiceCollection AddDataSourceFactory(this IServiceCollection services)
{
services.AddSingleton<DataReaderFactory>();
return services;
}
}
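
A rough DI wiring sketch for the factory (illustrative only): it assumes DataInputOptions is configured elsewhere and that IDataReader exposes the ReadAsync/Current members CsvReader implements; the path, table and headers are placeholders:

using ConsoleApp2.Options;
using ConsoleApp2.Services.ETL;
using Microsoft.Extensions.DependencyInjection;

var services = new ServiceCollection();
services.AddLogging();
services.Configure<DataInputOptions>(opt =>
{
    // set Delimiter, QuoteChar, UseMock, TableMockConfig, ... as needed
});
services.AddDataSourceFactory();

await using var provider = services.BuildServiceProvider();
var factory = provider.GetRequiredService<DataReaderFactory>();
var reader = factory.CreateReader("/data/user.csv.zst", "user", new[] { "ID", "Name", "CompanyID" });
while (await reader.ReadAsync())
{
    // consume reader.Current
}
(reader as IDisposable)?.Dispose(); // both CsvReader and ZstReader provide Dispose()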

View File

@@ -0,0 +1,215 @@
using System.Text;
using System.Text.RegularExpressions;
using ConsoleApp2.Helpers;
using ConsoleApp2.Options;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MySqlConnector;
namespace ConsoleApp2.Services.ETL;
/// <summary>
/// MySQL export destination; buffers records and flushes them as batched INSERT statements.
/// </summary>
public partial class MySqlDestination : IDisposable, IAsyncDisposable
{
private readonly Dictionary<string, IList<DataRecord>> _recordCache;
private readonly MySqlConnection _conn;
private readonly ILogger _logger;
private readonly IOptions<DatabaseOutputOptions> _options;
private readonly ErrorRecorder.OutputErrorRecorder _outputErrorRecorder;
private readonly ProcessContext _context;
public MySqlDestination(
string connStr,
ILogger logger,
IOptions<DatabaseOutputOptions> options,
ErrorRecorder.OutputErrorRecorder outputErrorRecorder,
ProcessContext context)
{
_conn = new MySqlConnection(connStr);
_conn.Open();
_recordCache = new Dictionary<string, IList<DataRecord>>();
_logger = logger;
_options = options;
_outputErrorRecorder = outputErrorRecorder;
_context = context;
}
public Task WriteRecordAsync(DataRecord record)
{
_recordCache.AddOrUpdate(record.TableName, [record], (_, value) =>
{
value.Add(record);
return value;
});
return Task.CompletedTask;
}
public async Task WriteRecordsAsync(IEnumerable<DataRecord> records)
{
foreach (var record in records)
{
await WriteRecordAsync(record);
}
}
/// <summary>
/// Writes all cached records to MySQL as batched INSERT statements and clears the cache.
/// Failed statements are logged and handed to the output error recorder; they are not retried.
/// </summary>
public async Task FlushAsync(int maxAllowPacket)
{
if (_recordCache.Count == 0)
return;
var cmd = _conn.CreateCommand();
cmd.CommandTimeout = 3 * 60; // 3 minutes
try
{
var excuseList = GetExcuseList(_recordCache, maxAllowPacket).ToList();
foreach (var insertSql in excuseList)
{
cmd.CommandText = insertSql;
try
{
await cmd.ExecuteNonQueryAsync();
}
catch (Exception e)
{
_logger.LogError(e, "插入数据库时发生错误, sql: {Sql}", cmd.CommandText.Omit(1000));
_context.AddException(e);
var match = MatchTableName().Match(cmd.CommandText);
if (match is { Success: true, Groups.Count: > 1 })
{
var tableName = match.Groups[1].Value;
await _outputErrorRecorder.LogErrorSqlAsync(cmd.CommandText, tableName, e);
}
else await _outputErrorRecorder.LogErrorSqlAsync(cmd.CommandText, e);
}
}
_recordCache.Clear();
}
catch (Exception e)
{
_logger.LogError(e, "序列化记录时发生错误");
throw;
}
finally
{
await cmd.DisposeAsync();
}
}
[GeneratedRegex("INSERT INTO `([^`]+)`")]
private static partial Regex MatchTableName();
/// <summary>
/// Builds the batched INSERT statements for the cached records, starting a new statement
/// whenever the SQL being built would exceed maxAllowPacket.
/// </summary>
public IEnumerable<string> GetExcuseList(IDictionary<string, IList<DataRecord>> tableRecords, int maxAllowPacket)
{
var sb = new StringBuilder("SET AUTOCOMMIT = 1;\n");
foreach (var (tableName, records) in tableRecords)
{
if (records.Count == 0)
continue;
var recordIdx = 0;
StartBuild:
var noCommas = true;
// INSERT INTO ... VALUES >>>
sb.Append($"INSERT INTO `{tableName}`(");
for (var i = 0; i < records[0].Headers.Count; i++)
{
var header = records[0].Headers[i];
sb.Append($"`{header}`");
if (i != records[0].Headers.Count - 1)
sb.Append(',');
}
sb.Append(") VALUES ");
// ([FIELDS]), >>>
for (;recordIdx < records.Count; recordIdx++)
{
var record = records[recordIdx];
var recordSb = new StringBuilder();
recordSb.Append('(');
for (var fieldIdx = 0; fieldIdx < record.Fields.Count; fieldIdx++)
{
var field = record.Fields[fieldIdx];
// handle special column types here
#region HandleFields
if (field == "\\N")
{
recordSb.Append("NULL");
goto Escape;
}
switch (_options.Value.GetColumnType(record.TableName, record.Headers[fieldIdx]))
{
case ColumnType.Text:
if(string.IsNullOrEmpty(field))
recordSb.Append("''");
else recordSb.Append($"_utf8mb4 0x{field}");
break;
case ColumnType.Blob:
if (string.IsNullOrEmpty(field))
recordSb.Append("''");
else recordSb.Append($"0x{field}");
break;
case ColumnType.Json:
if(string.IsNullOrEmpty(field))
recordSb.Append("'[]'"); // JObject or JArray?
else if (_options.Value.TreatJsonAsHex)
recordSb.Append($"_utf8mb4 0x{field}");
else recordSb.AppendLine(field);
break;
case ColumnType.UnDefine:
default:
recordSb.Append(field);
break;
}
Escape:
#endregion
if (fieldIdx != record.Fields.Count - 1)
recordSb.Append(',');
}
recordSb.Append(')');
// If the statement is about to exceed the size limit, yield the SQL built so far, clear the StringBuilder, keep the current record index, and jump back to StartBuild to begin a new INSERT.
if (sb.Length + recordSb.Length + 23 > maxAllowPacket)
{
sb.Append(';').AppendLine();
sb.Append("SET AUTOCOMMIT = 1;");
yield return sb.ToString();
sb.Clear();
goto StartBuild;
}
if (!noCommas)
sb.Append(',').AppendLine();
noCommas = false;
sb.Append(recordSb); // StringBuilder.Append(StringBuilder) copies characters without allocating an intermediate string
}
sb.Append(';');
sb.Append("COMMIT;");
yield return sb.ToString();
sb.Clear();
}
}
public void Dispose()
{
_conn.Close();
_conn.Dispose();
}
public async ValueTask DisposeAsync()
{
await _conn.CloseAsync();
await _conn.DisposeAsync();
}
}
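
A write-side sketch (illustrative only), assuming a reachable MySQL instance and that the logger, options, output error recorder and ProcessContext come from the service provider; the connection string and packet size are placeholders:

// logger, databaseOutputOptions, outputErrorRecorder, processContext: resolved from DI
await using var destination = new MySqlDestination(
    "Server=127.0.0.1;Port=3306;Database=cferp_test;User ID=root;Password=***;",
    logger, databaseOutputOptions, outputErrorRecorder, processContext);

// records: IEnumerable<DataRecord> produced by an IDataReader
await destination.WriteRecordsAsync(records);

// Build and execute the batched INSERTs; the value below is a placeholder for max_allowed_packet.
await destination.FlushAsync(maxAllowPacket: 64 * 1024 * 1024);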

View File

@@ -0,0 +1,64 @@
using ConsoleApp2.Options;
using Microsoft.Extensions.Logging;
namespace ConsoleApp2.Services.ETL;
/// <summary>
/// Reads the first row of the supplied ZST file and duplicates it into the configured number of mock records.
/// </summary>
public class ZstMockReader : ZstReader
{
private long _currentCount;
private readonly long _mockCount;
private DataRecord? _template;
private readonly bool _deepCopy;
private readonly string[]? _autoIncrementColumn;
static readonly IReadOnlyList<int> Range = [500, 1500, 2500]; // candidate CompanyID values for deep-copied records
public ZstMockReader(TableMockConfig mockConfig, string filePath, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null) : base(filePath, tableName, headers, delimiter, quoteChar, logger)
{
_mockCount = mockConfig.MockCount;
_deepCopy = mockConfig.UseDeepCopy;
_autoIncrementColumn = mockConfig.AutoIncrementColumn;
}
public ZstMockReader(TableMockConfig mockConfig, Stream stream, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null) : base(stream, tableName, headers, delimiter, quoteChar, logger)
{
_mockCount = mockConfig.MockCount;
_deepCopy = mockConfig.UseDeepCopy;
_autoIncrementColumn = mockConfig.AutoIncrementColumn;
}
public override async ValueTask<bool> ReadAsync()
{
if (_template is null)
{
if (!await base.ReadAsync())
throw new InvalidOperationException("所提供的ZST源为空无法生成模板数据");
_template = Current.Clone() as DataRecord;
if (_template is null)
throw new ApplicationException("记录拷贝失败");
_currentCount++;
return true;
}
if (_deepCopy)
{
Current = _template.Clone() as DataRecord ?? throw new ApplicationException("Failed to clone the record.");
if (_autoIncrementColumn is not null)
{
foreach (var column in _autoIncrementColumn)
{
// advance the auto-increment column and keep the updated record as the template so values keep increasing
Current[column] = (Convert.ToInt64(Current[column]) + 1).ToString();
_template = Current;
}
}
Current["CompanyID"] = Range[Random.Shared.Next(0, Range.Count)].ToString();//随机CompanyID
}
else Current = _template;
_currentCount++;
return _currentCount < _mockCount;
}
}
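
A mock-generation sketch (illustrative only); it assumes TableMockConfig exposes MockCount, UseDeepCopy and AutoIncrementColumn as settable properties, and the path and column names are placeholders:

using ConsoleApp2.Options;
using ConsoleApp2.Services.ETL;

var mockConfig = new TableMockConfig
{
    MockCount = 1_000,
    UseDeepCopy = true,
    AutoIncrementColumn = new[] { "ID" }
};
var mockReader = new ZstMockReader(mockConfig, "/data/user.csv.zst", "user",
    new[] { "ID", "Name", "CompanyID" });
while (await mockReader.ReadAsync())
{
    // each record is a copy of the first row with "ID" incremented and "CompanyID" randomized
}
mockReader.Dispose();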

View File

@@ -0,0 +1,48 @@
using Microsoft.Extensions.Logging;
using ZstdSharp;
namespace ConsoleApp2.Services.ETL;
/// <summary>
/// Decompresses a ZST file and reads CSV data from it.
/// </summary>
public class ZstReader : CsvReader
{
protected new readonly Lazy<StreamReader> Reader; // hides the base reader so lines are read from the decompression stream
public ZstReader(string filePath, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null)
: base(filePath, tableName, headers, delimiter, quoteChar, logger)
{
var ds = new DecompressionStream(File.OpenRead(filePath));
Reader = new Lazy<StreamReader>(() => new StreamReader(ds));
}
public ZstReader(Stream stream, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null)
: base(stream, tableName, headers, delimiter, quoteChar, logger)
{
var ds = new DecompressionStream(stream);
Reader = new Lazy<StreamReader>(() => new StreamReader(ds));
}
public override async ValueTask<bool> ReadAsync()
{
var str = await Reader.Value.ReadLineAsync();
if (string.IsNullOrWhiteSpace(str))
return false;
CurrentRaw = str;
var fields = ParseRow(str, QuoteChar, Delimiter);
Current = new DataRecord(fields, TableName, Headers) {RawField = str};
return true;
}
public override void Dispose()
{
base.Dispose();
if(Reader.IsValueCreated)
Reader.Value.Dispose();
}
}
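
And the plain, non-mock ZST reading loop, mirroring the CsvReader sketch above; the path and schema are again placeholders:

var zstReader = new ZstReader("/data/order.csv.zst", "order", new[] { "ID", "OrderNo", "CompanyID" });
try
{
    while (await zstReader.ReadAsync())
    {
        // e.g. hand zstReader.Current to MySqlDestination.WriteRecordAsync
    }
}
finally
{
    zstReader.Dispose();
}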