MES-ETL/ConsoleApp2/Services/CsvSource.cs

208 lines
6.5 KiB
C#
Raw Normal View History

2023-12-29 16:16:05 +08:00
using System.Text;
2024-01-12 16:50:37 +08:00
using System.Text.RegularExpressions;
2023-12-29 16:16:05 +08:00
using ConsoleApp2.Helpers;
2024-01-12 16:50:37 +08:00
using ConsoleApp2.HostedServices.Abstractions;
2023-12-29 16:16:05 +08:00
using Microsoft.Extensions.Logging;
namespace ConsoleApp2.Services;
2024-01-04 09:00:44 +08:00
/// <summary>
/// Reads delimited (CSV-style) data files that belong to a single table.
/// The constructor looks in the input directory for a file whose path matches
/// <c>*.{tableName}.*.sql</c>; the column headers and the names of the data files
/// are later extracted from that file's text by <see cref="DumpDataHelper"/>.
/// </summary>
public class CsvSource : IDataSource
{
    /// <summary>Pattern handed to <see cref="DumpDataHelper"/> to locate quoted <c>.dat</c> file names.</summary>
    private static readonly Regex DataFileNameRegex = new(@"'.+\.dat'");

    /// <summary>Directory that contains the SQL file and the data files.</summary>
    protected readonly string _inputDir;

    // Stored for derived/future use; this class does not log anything itself.
    private readonly ILogger? _logger;

    /// <summary>Name of the table whose data is read.</summary>
    protected readonly string _tableName;

    /// <summary>
    /// Path of the SQL file found for the table. It stays null when the constructor
    /// finds no matching file; the read methods then throw <see cref="FileNotFoundException"/>.
    /// </summary>
    protected string _sqlFilePath;

    /// <summary>Kept for compatibility with derived types; never assigned by this class.</summary>
    protected readonly string _sqlFileText;

    /// <summary>Raw text of the current row. Not assigned anywhere in this class.</summary>
    public string? CurrentRaw { get; protected set; }

    /// <summary>Field delimiter used when splitting rows.</summary>
    public string Delimiter { get; private set; }

    /// <summary>Character that encloses quoted fields.</summary>
    public char QuoteChar { get; private set; }

    /// <summary>
    /// Creates a source for <paramref name="tableName"/> and searches
    /// <paramref name="inputDir"/> for the table's SQL file.
    /// </summary>
    /// <param name="inputDir">Directory to search; must exist (it is enumerated immediately).</param>
    /// <param name="tableName">Table name expected between two dots in the SQL file's path.</param>
    /// <param name="delimiter">Field delimiter, <c>,</c> by default.</param>
    /// <param name="quoteChar">Quote character, <c>"</c> by default.</param>
    /// <param name="logger">Optional logger.</param>
    public CsvSource(string inputDir, string tableName, string delimiter = ",", char quoteChar = '"',
        ILogger? logger = null)
    {
        _inputDir = inputDir;
        _tableName = tableName;
        _logger = logger;
        Delimiter = delimiter;
        QuoteChar = quoteChar;

        // Escape the table name so that regex metacharacters in it are matched literally.
        var pattern = $"^.*\\.{Regex.Escape(tableName)}\\..*\\.sql$";

        // May legitimately find nothing; the missing file is reported when it is first needed.
        _sqlFilePath = Directory.GetFiles(_inputDir).FirstOrDefault(path => Regex.IsMatch(path, pattern))!;
    }

    /// <summary>
    /// Splits <paramref name="row"/> on every occurrence of <paramref name="delimiter"/>.
    /// Quote state is deliberately NOT tracked, so the delimiter has to be distinctive
    /// enough not to occur inside field values. Each field has one pair of enclosing
    /// quote characters removed, the literal <c>\N</c> is replaced by <c>NULL</c>, and
    /// fields accepted by the project's hex checks are decoded with
    /// <c>StringExtensions.FromHex</c>.
    /// </summary>
    /// <param name="row">The row text; must not be empty.</param>
    /// <param name="quoteChar">Character that may enclose a field.</param>
    /// <param name="delimiter">Field delimiter; must not be empty.</param>
    /// <returns>The fields of the row, in order.</returns>
    /// <exception cref="ArgumentException">The row or the delimiter is empty.</exception>
    public string[] ParseRow(string row, char quoteChar, string delimiter)
    {
        var span = row.AsSpan();
        if (span.Length == 0)
        {
            throw new ArgumentException("The row is empty", nameof(row));
        }

        if (string.IsNullOrEmpty(delimiter))
        {
            throw new ArgumentException("The delimiter is empty", nameof(delimiter));
        }

        var result = new List<string>();
        var start = 0;

        // Start at 0 so a leading empty field is detected; StartsWith also matches a
        // delimiter that sits at the very end of the row (trailing empty field).
        for (var i = 0; i < span.Length; i++)
        {
            // A quote character is never treated as the start of a delimiter.
            if (span[i] == quoteChar)
            {
                continue;
            }

            if (!span[i..].StartsWith(delimiter.AsSpan(), StringComparison.Ordinal))
            {
                continue;
            }

            result.Add(NormalizeField(span[start..i], quoteChar));
            start = i + delimiter.Length;

            // Skip the rest of a multi-character delimiter so it cannot be matched twice.
            i = start - 1;
        }

        // The last field gets the same treatment as all the others.
        result.Add(NormalizeField(span[start..], quoteChar));

        for (var i = 0; i < result.Count; i++)
        {
            var field = result[i];
            if (DumpDataHelper.CheckHexField(field) && StringExtensions.CheckJsonHex(field))
            {
                result[i] = StringExtensions.FromHex(field);
            }
        }

        return result.ToArray();
    }

    /// <summary>
    /// Splits <paramref name="source"/> into fields while honouring quotes and backslash
    /// escapes. Quote characters and backslashes are kept in the returned fields; a
    /// character that follows an unescaped backslash is always taken literally, and a
    /// delimiter only separates fields when it appears outside quotes.
    /// </summary>
    /// <param name="source">The row text. An empty row yields a single empty field.</param>
    /// <param name="quoteChar">Character that toggles the quoted state.</param>
    /// <param name="delimiter">Field delimiter; must not be empty. Matched in full, ordinally.</param>
    /// <returns>The raw (still quoted/escaped) fields of the row, in order.</returns>
    /// <exception cref="ArgumentException">The delimiter is empty.</exception>
    public string[] ParseRow2(ReadOnlySpan<char> source, char quoteChar, string delimiter)
    {
        if (string.IsNullOrEmpty(delimiter))
        {
            throw new ArgumentException("The delimiter is empty", nameof(delimiter));
        }

        var result = new List<string>();
        var current = new StringBuilder();
        var inQuote = false;
        var escaped = false;

        for (var index = 0; index < source.Length; index++)
        {
            var ch = source[index];

            if (escaped)
            {
                // Escaped character: never a quote toggle and never a delimiter.
                current.Append(ch);
                escaped = false;
                continue;
            }

            if (ch == '\\')
            {
                escaped = true;
                current.Append(ch);
                continue;
            }

            if (ch == quoteChar)
            {
                inQuote = !inQuote;
                current.Append(ch);
                continue;
            }

            if (!inQuote && source[index..].StartsWith(delimiter.AsSpan(), StringComparison.Ordinal))
            {
                result.Add(current.ToString());
                current.Clear();

                // Consume the whole delimiter, not just its first character.
                index += delimiter.Length - 1;
                continue;
            }

            current.Append(ch);
        }

        result.Add(current.ToString());
        return result.ToArray();
    }

    /// <summary>
    /// Reads the table's SQL file and lets <see cref="DumpDataHelper"/> extract the column headers.
    /// </summary>
    /// <exception cref="FileNotFoundException">No SQL file was found for the table.</exception>
    public virtual async Task<string[]> GetHeaders()
    {
        var text = await ReadSqlFileAsync();
        return await DumpDataHelper.GetCsvHeadersFromSqlFileAsync(text);
    }

    /// <summary>
    /// Reads the table's SQL file and lets <see cref="DumpDataHelper"/> extract the data
    /// file names that match <c>'.+\.dat'</c>.
    /// </summary>
    /// <exception cref="FileNotFoundException">No SQL file was found for the table.</exception>
    public virtual async Task<string[]> GetCsvFiles()
    {
        var text = await ReadSqlFileAsync();
        return await DumpDataHelper.GetCsvFileNamesFromSqlFileAsync(text, DataFileNameRegex);
    }

    /// <summary>
    /// Reads every data file line by line, parses each line with <see cref="ParseRow2"/>
    /// and passes the resulting <see cref="DataRecord"/> to <paramref name="action"/>.
    /// </summary>
    /// <param name="action">Callback invoked once per record; a null callback is tolerated.</param>
    public virtual async Task DoEnqueue(Action<DataRecord> action)
    {
        var sourceFiles = await GetCsvFiles();

        // Headers come from the SQL file and are the same for all data files:
        // fetch them once, and only if there is at least one file to read.
        string[]? headers = null;

        foreach (var file in sourceFiles)
        {
            headers ??= await GetHeaders();
            var filePath = Path.Combine(_inputDir, file);

            using var fs = File.OpenRead(filePath);
            using var sr = new StreamReader(fs);

            // ReadLineAsync returns null at end of stream; this avoids EndOfStream,
            // which may perform a blocking synchronous read.
            string? line;
            while ((line = await sr.ReadLineAsync()) is not null)
            {
                var fields = ParseRow2(line, QuoteChar, Delimiter);
                var record = new DataRecord(fields, _tableName, headers);
                action?.Invoke(record);
            }
        }
    }

    /// <summary>
    /// Builds a record from the first line of the first data file.
    /// </summary>
    /// <returns>The record, or null when there is no data file or the file is empty.</returns>
    public virtual async Task<DataRecord?> GetTestRecord()
    {
        var sourceFiles = await GetCsvFiles();
        var file = sourceFiles.FirstOrDefault();
        if (file is null)
        {
            return null;
        }

        var headers = await GetHeaders();
        var filePath = Path.Combine(_inputDir, file);

        using var fs = File.OpenRead(filePath);
        using var sr = new StreamReader(fs);

        var line = await sr.ReadLineAsync();
        if (line is null)
        {
            // Empty file: there is no row to build a record from.
            return null;
        }

        var fields = ParseRow2(line, QuoteChar, Delimiter);
        return new DataRecord(fields, _tableName, headers);
    }

    /// <summary>
    /// Nothing to release: files are opened and disposed inside the methods that read them.
    /// </summary>
    public void Dispose()
    {
    }

    /// <summary>Reads the whole SQL file, failing with a descriptive error when none was found.</summary>
    private async Task<string> ReadSqlFileAsync()
    {
        if (string.IsNullOrEmpty(_sqlFilePath))
        {
            throw new FileNotFoundException(
                $"No .sql file for table '{_tableName}' was found in '{_inputDir}'.");
        }

        return await File.ReadAllTextAsync(_sqlFilePath);
    }

    /// <summary>Strips one pair of enclosing quote characters and maps the literal \N to NULL.</summary>
    private static string NormalizeField(ReadOnlySpan<char> raw, char quoteChar)
    {
        // Require two characters so that a lone quote character is left untouched.
        if (raw.Length >= 2 && raw[0] == quoteChar && raw[^1] == quoteChar)
        {
            raw = raw[1..^1];
        }

        var field = raw.ToString();
        return field == "\\N" ? "NULL" : field;
    }
}