184 lines
5.7 KiB
C#
184 lines
5.7 KiB
C#
using System.Reflection.PortableExecutable;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using ConsoleApp2.Helpers;
|
|
using ConsoleApp2.HostedServices.Abstractions;
|
|
using Microsoft.Extensions.Logging;
|
|
|
|
namespace ConsoleApp2.Services;
|
|
|
|
/// <summary>
/// Data source that reads delimited (CSV-style) text files belonging to one table.
/// The SQL file that accompanies the table is located in the input directory at construction
/// time; the column headers and the list of data files are later extracted from its text.
/// </summary>
public class CsvSource : IDataSource
{
    // Matches the quoted data-file names inside the SQL file text. Both dots are escaped so
    // that only a literal ".dat.zst" suffix is accepted.
    private static readonly Regex DataFileNamePattern = new(@"'.+\.dat\.zst'");

    // Directory that contains the SQL file and the data files.
    protected readonly string _inputDir;

    // Optional logger; not used by this class itself.
    private readonly ILogger? _logger;

    // Name of the table whose rows are produced.
    protected readonly string _tableName;

    // Full path of the SQL file found for the table, or null when no file matched.
    protected string? _sqlFilePath;

    // NOTE(review): never assigned anywhere in this class, so it is always null here.
    protected readonly string? _sqlFileText;

    // Column headers extracted from the SQL file by GetHeaderAndCsvFiles.
    protected string[]? headers;

    // Data file names extracted from the SQL file by GetHeaderAndCsvFiles.
    protected string[]? csvFiles;

    public string? CurrentRaw { get; protected set; }

    /// <summary>Field delimiter used when splitting a row.</summary>
    public string Delimiter { get; private set; }

    /// <summary>Character that encloses quoted fields.</summary>
    public char QuoteChar { get; private set; }

    /// <summary>
    /// Creates the source and looks up the table's SQL file in <paramref name="inputDir"/>.
    /// </summary>
    /// <param name="inputDir">Directory holding the SQL file and the data files.</param>
    /// <param name="tableName">Table name; it is matched literally inside the SQL file name.</param>
    /// <param name="delimiter">Field delimiter.</param>
    /// <param name="quoteChar">Field quote character.</param>
    /// <param name="logger">Optional logger.</param>
    public CsvSource(string inputDir, string tableName, string delimiter = ",", char quoteChar = '"',
        ILogger? logger = null)
    {
        _inputDir = inputDir;
        _tableName = tableName;
        _logger = logger;
        Delimiter = delimiter;
        QuoteChar = quoteChar;

        // The table name is escaped so regex metacharacters in it are matched literally, and
        // the pattern is applied to the file name only so directory names cannot cause a match.
        var sqlFileName = new Regex($"^.*\\.{Regex.Escape(tableName)}\\..*\\.sql$");
        _sqlFilePath = Directory.GetFiles(_inputDir)
            .FirstOrDefault(path => sqlFileName.IsMatch(Path.GetFileName(path)));
    }

    /// <summary>
    /// Splits a row on every occurrence of <paramref name="delimiter"/>.
    /// Quoting does NOT protect a delimiter: the delimiter must be distinctive enough never to
    /// occur inside field data. A field enclosed in <paramref name="quoteChar"/> has the
    /// enclosing pair removed, and a field equal to <c>\N</c> is replaced by <c>NULL</c>.
    /// Afterwards, fields accepted by both <c>DumpDataHelper.CheckHexField</c> and
    /// <c>StringExtensions.CheckJsonHex</c> are replaced by <c>StringExtensions.FromHex</c>.
    /// </summary>
    /// <param name="row">The raw row text.</param>
    /// <param name="quoteChar">Character that encloses quoted fields.</param>
    /// <param name="delimiter">Field delimiter; may be longer than one character.</param>
    /// <returns>The fields of the row, in order.</returns>
    /// <exception cref="ArgumentException">The row or the delimiter is empty.</exception>
    public string[] ParseRow(string row, char quoteChar, string delimiter)
    {
        var span = row.AsSpan();
        if (span.Length == 0)
        {
            throw new ArgumentException("The row is empty", nameof(row));
        }

        if (string.IsNullOrEmpty(delimiter))
        {
            throw new ArgumentException("The delimiter must not be empty", nameof(delimiter));
        }

        var separator = delimiter.AsSpan();
        var result = new List<string>();
        var start = 0;
        var position = 0;

        while (position < span.Length)
        {
            // Ordinal comparison: the delimiter is machine data, not culture-sensitive text.
            if (span[position..].StartsWith(separator, StringComparison.Ordinal))
            {
                result.Add(ExtractField(span[start..position], quoteChar));

                // Jump over the whole delimiter so its remaining characters are not re-scanned.
                position += separator.Length;
                start = position;
            }
            else
            {
                position++;
            }
        }

        // The remainder after the last delimiter (empty when the row ends with a delimiter).
        result.Add(ExtractField(span[start..], quoteChar));

        for (var i = 0; i < result.Count; i++)
        {
            var field = result[i];
            if (DumpDataHelper.CheckHexField(field) && StringExtensions.CheckJsonHex(field))
            {
                result[i] = StringExtensions.FromHex(field);
            }
        }

        return result.ToArray();
    }

    // Removes one enclosing pair of quote characters (if present) and maps \N to NULL.
    private static string ExtractField(ReadOnlySpan<char> raw, char quoteChar)
    {
        // At least two characters are required so a lone quote character is not treated as a pair.
        if (raw.Length >= 2 && raw[0] == quoteChar && raw[^1] == quoteChar)
        {
            raw = raw[1..^1];
        }

        var field = raw.ToString();
        return field == "\\N" ? "NULL" : field;
    }

    /// <summary>
    /// Splits a row on <paramref name="delimiter"/> while honouring quotes and backslash escapes.
    /// A delimiter inside a quoted section does not split. A backslash prevents the character
    /// that follows it from toggling the quoted state. Quote characters and backslashes are
    /// kept in the returned fields.
    /// </summary>
    /// <param name="source">The raw row text.</param>
    /// <param name="quoteChar">Character that opens and closes a quoted section.</param>
    /// <param name="delimiter">Field delimiter; may be longer than one character.</param>
    /// <returns>The fields of the row, in order; a single empty field for empty input.</returns>
    /// <exception cref="ArgumentException">The delimiter is null or empty.</exception>
    public string[] ParseRow2(ReadOnlySpan<char> source, char quoteChar, string delimiter)
    {
        if (string.IsNullOrEmpty(delimiter))
        {
            throw new ArgumentException("The delimiter must not be empty", nameof(delimiter));
        }

        var separator = delimiter.AsSpan();
        var result = new List<string>();
        var current = new StringBuilder();
        var hasQuote = false;
        var hasSlash = false;

        for (var index = 0; index < source.Length; index++)
        {
            var ch = source[index];

            if (!hasSlash && ch == '\\')
            {
                // Remember the escape so the next character cannot toggle the quoted state.
                hasSlash = true;
                current.Append(ch);
                continue;
            }

            if (!hasSlash && ch == quoteChar)
            {
                hasQuote = !hasQuote;
                current.Append(ch);
                continue;
            }

            if (!hasQuote && source[index..].StartsWith(separator, StringComparison.Ordinal))
            {
                result.Add(current.ToString());
                current.Clear();

                // Skip the rest of a multi-character delimiter; the loop adds the final step.
                index += separator.Length - 1;
            }
            else
            {
                current.Append(ch);
            }

            hasSlash = false;
        }

        result.Add(current.ToString());
        return result.ToArray();
    }

    /// <summary>
    /// Reads the table's SQL file and extracts the column headers and the data file names.
    /// </summary>
    /// <exception cref="FileNotFoundException">
    /// No SQL file for the table was found in the input directory.
    /// </exception>
    public virtual async Task GetHeaderAndCsvFiles()
    {
        if (_sqlFilePath is null)
        {
            throw new FileNotFoundException(
                $"No SQL file for table '{_tableName}' was found in '{_inputDir}'.");
        }

        var text = await File.ReadAllTextAsync(_sqlFilePath);
        headers = await DumpDataHelper.GetCsvHeadersFromSqlFileAsync(text);
        csvFiles = await DumpDataHelper.GetCsvFileNamesFromSqlFileAsync(text, DataFileNamePattern);
    }

    /// <summary>
    /// Reads every data file line by line, parses each line with <see cref="ParseRow2"/> and
    /// passes the resulting record to <paramref name="action"/>.
    /// </summary>
    /// <param name="action">Callback invoked once per record; ignored when null.</param>
    public virtual async Task DoEnqueue(Action<DataRecord> action)
    {
        await GetHeaderAndCsvFiles();

        // An override of GetHeaderAndCsvFiles might leave csvFiles unset; treat that as no files.
        foreach (var file in csvFiles ?? Array.Empty<string>())
        {
            var filePath = Path.Combine(_inputDir, file);
            using var fs = File.OpenRead(filePath);
            using var sr = new StreamReader(fs);

            // ReadLineAsync returns null at end of stream; this avoids StreamReader.EndOfStream,
            // which may block synchronously inside an async method.
            while (await sr.ReadLineAsync() is { } line)
            {
                var fields = ParseRow2(line, QuoteChar, Delimiter);
                var record = new DataRecord(fields, _tableName, headers);
                action?.Invoke(record);
            }
        }
    }

    /// <summary>
    /// Builds a record from the first line of the first data file.
    /// </summary>
    /// <returns>The record, or null when there is no data file or the first file is empty.</returns>
    public virtual async Task<DataRecord?> GetTestRecord()
    {
        await GetHeaderAndCsvFiles();

        var file = csvFiles?.FirstOrDefault();
        if (file is null)
        {
            return null;
        }

        var filePath = Path.Combine(_inputDir, file);
        using var fs = File.OpenRead(filePath);
        using var sr = new StreamReader(fs);

        var line = await sr.ReadLineAsync();
        if (line is null)
        {
            // Empty file: there is no row to build a record from.
            return null;
        }

        var fields = ParseRow2(line, QuoteChar, Delimiter);
        return new DataRecord(fields, _tableName, headers);
    }

    /// <summary>
    /// Nothing to release: every stream is opened and disposed inside the method that uses it.
    /// </summary>
    public void Dispose()
    {
    }
}