2024-02-06 15:37:21 +08:00
|
|
|
|
using System.Text.Json;
|
|
|
|
|
using System.Text.RegularExpressions;
|
2024-12-10 14:03:09 +08:00
|
|
|
|
using MesETL.App.HostedServices;
|
|
|
|
|
using Serilog;
|
2024-01-29 09:29:16 +08:00
|
|
|
|
using ZstdSharp;
|
2023-12-28 15:18:03 +08:00
|
|
|
|
|
2024-02-02 17:14:41 +08:00
|
|
|
|
namespace MesETL.App.Helpers;
|
2023-12-28 15:18:03 +08:00
|
|
|
|
|
|
|
|
|
public static partial class DumpDataHelper
|
|
|
|
|
{
|
|
|
|
|
/// <summary>
/// Source-generated regex that matches a single-quoted span ending in <c>.dat</c>:
/// a quote, one or more characters (greedy), the literal <c>.dat</c>, and a closing quote.
/// </summary>
[GeneratedRegex(@"'.+\.dat'")]
|
|
|
|
|
private static partial Regex MatchDatFile();
|
|
|
|
|
/// <summary>
/// Source-generated regex that matches a parenthesised group containing no <c>)</c>
/// between the opening and the closing parenthesis.
/// </summary>
[GeneratedRegex(@"\([^)]*\)")]
|
|
|
|
|
private static partial Regex MatchBrackets();
|
|
|
|
|
|
2024-01-29 09:29:16 +08:00
|
|
|
|
/// <summary>
/// Reads the column headers from the content of a SQL file exported by MyDumper.
/// </summary>
/// <remarks>
/// The first parenthesised group found in <paramref name="content"/> is treated as the column list.
/// It is split on <c>,</c>; surrounding white space, back-ticks and <c>@</c> are stripped from each entry.
/// </remarks>
/// <param name="content">Text of the SQL file.</param>
/// <returns>The column names, in the order they appear in the column list.</returns>
/// <exception cref="ArgumentException">No parenthesised group was found in <paramref name="content"/>.</exception>
public static string[] GetCsvHeadersFromSqlFile(string content)
{
    var match = MatchBrackets().Match(content);
    if (!match.Success)
    {
        throw new ArgumentException("输入的SQL内容有误,无法提取表头", nameof(content));
    }

    return ParseHeader(match.ValueSpan);

    static string[] ParseHeader(ReadOnlySpan<char> headerStr)
    {
        // The match always includes the enclosing parentheses; drop them.
        headerStr = headerStr[1..^1];

        var headers = new List<string>();
        foreach (var range in headerStr.Split(','))
        {
            // Trim white space first: Trim("@`") stops at the first character outside its set,
            // so an entry written as ", `col`" would otherwise keep its leading blank and back-tick.
            // Afterwards remove the back-ticks around the column name and the '@' of a variable.
            headers.Add(headerStr[range].Trim().Trim("@`").ToString());
        }

        return headers.ToArray();
    }
}
|
|
|
|
|
|
2024-01-29 09:29:16 +08:00
|
|
|
|
/// <summary>
/// Extracts the table name from the name of a CSV file exported by MyDumper.
/// </summary>
/// <remarks>
/// The directory part of <paramref name="filePath"/> is removed (both <c>\</c> and <c>/</c> are
/// accepted as separators); the table name is the text between the first and the second <c>.</c>
/// of the remaining file name.
/// </remarks>
/// <param name="filePath">File name or full path of the exported file.</param>
/// <returns>The text between the first two dots of the file name.</returns>
/// <exception cref="ArgumentOutOfRangeException">The file name contains fewer than two dots.</exception>
[Obsolete("用ParseMyDumperFile替代")]
public static string GetTableNameFromCsvFileName(ReadOnlySpan<char> filePath)
{
    // Looking only for '\\' would leave Unix-style directories in place, and a dot inside
    // a directory name would then be mistaken for one of the file-name separators.
    var fileName = filePath[(filePath.LastIndexOfAny('\\', '/') + 1)..];

    var firstDotIdx = fileName.IndexOf('.');
    // Offset of the second dot, counted from the character right after the first dot.
    var tableNameLength = firstDotIdx < 0 ? -1 : fileName[(firstDotIdx + 1)..].IndexOf('.');
    if (tableNameLength < 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(filePath), $"无法从文件名解析表名: {fileName.ToString()}");
    }

    return fileName.Slice(firstDotIdx + 1, tableNameLength).ToString();
}
|
2024-12-10 14:03:09 +08:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// Kind of a MyDumper export file.
/// </summary>
public enum MyDumperFileType
{
    /// <summary>A <c>dat</c> file.</summary>
    Dat,

    /// <summary>A <c>sql</c> file.</summary>
    Sql,
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Metadata describing one MyDumper export file.
/// </summary>
/// <param name="Path">Path of the file.</param>
/// <param name="Database">Database name.</param>
/// <param name="TableName">Table name.</param>
/// <param name="Index">Numeric index of the file.</param>
/// <param name="Type">Kind of the file.</param>
public record MyDumperFileMeta(
    string Path,
    string Database,
    string TableName,
    int Index,
    MyDumperFileType Type);
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Parses the name of a MyDumper export file into its components.
/// </summary>
/// <remarks>
/// The file name (without directory) is split on <c>.</c>. The first four segments are read as
/// database, table name, numeric index and file type (<c>dat</c> or <c>sql</c>); any further
/// segments are ignored.
/// </remarks>
/// <param name="path">File name or path of the export file.</param>
/// <returns>The parsed metadata; its <c>Path</c> is <paramref name="path"/> as given.</returns>
/// <exception cref="ArgumentException">
/// The name could not be parsed for any reason; the underlying failure is the inner exception.
/// </exception>
public static MyDumperFileMeta ParseMyDumperFile(ReadOnlySpan<char> path)
{
    try
    {
        var segments = Path.GetFileName(path).ToString().Split('.');

        MyDumperFileType fileType;
        switch (segments[3])
        {
            case "dat":
                fileType = MyDumperFileType.Dat;
                break;
            case "sql":
                fileType = MyDumperFileType.Sql;
                break;
            default:
                throw new ArgumentException("不支持的MyDumper文件类型", nameof(path));
        }

        var database = segments[0];
        var tableName = segments[1];
        var index = int.Parse(segments[2]);
        return new MyDumperFileMeta(path.ToString(), database, tableName, index, fileType);
    }
    catch (Exception inner)
    {
        // Every failure (too few segments, non-numeric index, unknown type) is reported uniformly.
        throw new ArgumentException($"此文件不是MyDumper导出的文件 {path}", nameof(path), inner);
    }
}
|
2023-12-28 15:18:03 +08:00
|
|
|
|
|
2024-01-29 09:29:16 +08:00
|
|
|
|
/// <summary>
/// Reads the CSV file names from the content of a SQL file exported by MyDumper.
/// </summary>
/// <param name="txt">Text of the SQL file.</param>
/// <param name="regex">
/// Pattern locating each file name; the first and the last character of every match are discarded.
/// </param>
/// <returns>A completed task holding one entry per match, in match order.</returns>
public static Task<string[]> GetCsvFileNamesFromSqlFileAsync(string txt, Regex regex)
{
    var found = regex.Matches(txt);
    var fileNames = new string[found.Count];

    for (var i = 0; i < fileNames.Length; i++)
    {
        // Strip the delimiter character on each side of the match.
        fileNames[i] = found[i].ValueSpan[1..^1].ToString();
    }

    return Task.FromResult(fileNames);
}
|
|
|
|
|
|
2024-01-29 09:29:16 +08:00
|
|
|
|
/// <summary>
/// Checks whether a string is a hexadecimal value.
/// </summary>
/// <param name="str">Text to inspect; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> when <paramref name="str"/> consists solely of ASCII hex digits and at least one
/// of them is not a number character; <c>false</c> for null, empty or white-space input, for
/// input starting with a double quote, and for input made of decimal digits only.
/// </returns>
public static bool CheckHexField(string? str)
{
    if (string.IsNullOrWhiteSpace(str) || str.StartsWith('\"'))
    {
        return false;
    }

    var sawNonNumber = false;
    for (var i = 0; i < str.Length; i++)
    {
        var ch = str[i];
        if (!char.IsAsciiHexDigit(ch))
        {
            return false;
        }

        sawNonNumber |= !char.IsNumber(ch);
    }

    // A value made of digits only is rejected so plain numbers are not reported as hex.
    return sawNonNumber;
}
|
2024-01-29 09:29:16 +08:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// Decompresses the input stream as Zstandard and returns the result as a string.
/// </summary>
/// <param name="stream">Stream positioned at the start of the compressed data.</param>
/// <returns>The decompressed text, read to the end with a default <see cref="StreamReader"/>.</returns>
/// <remarks>
/// NOTE(review): whether <paramref name="stream"/> itself is closed when the decompression
/// stream is disposed depends on ZstdSharp's leave-open default — confirm before relying on it.
/// </remarks>
public static async Task<string> DecompressZstAsStringAsync(Stream stream)
{
    await using (var zstdStream = new DecompressionStream(stream))
    {
        var textReader = new StreamReader(zstdStream);
        var text = await textReader.ReadToEndAsync();
        return text;
    }
}
|
2024-12-10 14:03:09 +08:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// File-metadata builder for the file input service and for MyDumper zstd export directories.
/// </summary>
/// <remarks>
/// Only files ending in <c>.dat.zst</c> are handled. The column headers are read from the
/// companion file that has the same name with the suffix <c>.sql.zst</c>.
/// </remarks>
/// <param name="filePath">Path of the candidate file.</param>
/// <returns>
/// The input description for the data file, or <c>null</c> when <paramref name="filePath"/> does
/// not end in <c>.dat.zst</c> or the companion SQL file does not exist.
/// </returns>
/// <exception cref="ArgumentException">
/// The file name is not a MyDumper file name, or no column list was found in the SQL file.
/// </exception>
public static FileInputInfo? MyDumperFileInputMetaBuilder(string filePath)
{
    const string dataSuffix = ".dat.zst";
    const string sqlSuffix = ".sql.zst";

    // Ordinal comparison: this is a file-name suffix, not linguistic text.
    if (!filePath.EndsWith(dataSuffix, StringComparison.Ordinal))
    {
        return null;
    }

    var fileMeta = ParseMyDumperFile(filePath);

    // Swap only the trailing suffix; a blanket Replace would also rewrite a matching
    // fragment inside a directory name.
    var sqlFile = filePath[..^dataSuffix.Length] + sqlSuffix;

    // A direct existence check avoids listing the whole directory for every data file.
    if (!File.Exists(sqlFile))
    {
        Log.Debug("{TableName}表的SQL文件不存在", fileMeta.TableName);
        return null;
    }

    // Own the file handle here so it is released even if decompression or parsing fails.
    using var sqlStream = File.OpenRead(sqlFile);

    // NOTE: blocks on the asynchronous decompression because this builder is synchronous;
    // failures inside the task therefore surface wrapped in an AggregateException.
    var headers = GetCsvHeadersFromSqlFile(DecompressZstAsStringAsync(sqlStream).Result);

    return new FileInputInfo
    {
        FileName = filePath,
        TableName = fileMeta.TableName,
        Headers = headers,
        Database = fileMeta.Database,
        Part = fileMeta.Index
    };
}
|
2023-12-28 15:18:03 +08:00
|
|
|
|
}
|