2023-12-29 16:16:05 +08:00
using System.Text ;
2024-01-12 16:50:37 +08:00
using System.Text.RegularExpressions ;
2023-12-29 16:16:05 +08:00
using ConsoleApp2.Helpers ;
2024-01-12 16:50:37 +08:00
using ConsoleApp2.HostedServices.Abstractions ;
2023-12-29 16:16:05 +08:00
using Microsoft.Extensions.Logging ;
namespace ConsoleApp2.Services ;
2024-01-04 09:00:44 +08:00
/// <summary>
/// Reads CSV files.
/// </summary>
2024-01-12 16:50:37 +08:00
public class CsvSource : IDataSource
2023-12-29 16:16:05 +08:00
{
2024-01-12 16:50:37 +08:00
/// <summary>Directory passed to the constructor; it is searched for the table's .sql file and combined with data file names when they are opened.</summary>
protected readonly string _inputDir ;
//protected readonly StreamReader _reader;
2023-12-29 16:16:05 +08:00
/// <summary>Optional logger supplied to the constructor; may be null.</summary>
private readonly ILogger ? _logger ;
2024-01-12 16:50:37 +08:00
/// <summary>Table name passed to the constructor; handed to every DataRecord this source creates.</summary>
protected readonly string _tableName ;
/// <summary>Path of the first file in the input directory that matches the table's .sql pattern; null when nothing matched.</summary>
protected string _sqlFilePath ;
/// <summary>NOTE(review): this readonly field is never assigned by the constructor, so it stays null — confirm whether it can be removed.</summary>
protected readonly string _sqlFileText ;
2023-12-29 16:16:05 +08:00
2024-01-12 16:50:37 +08:00
//public DataRecord Current { get; protected set; }
//public string[]? Headers { get; }
/// <summary>Raw text of the current row. Nothing in the visible code assigns it (the ReadAsync that did is commented out).</summary>
public string? CurrentRaw { get ; protected set ; }
2023-12-29 16:16:05 +08:00
/// <summary>Field delimiter assigned in the constructor and passed to the row parser.</summary>
public string Delimiter { get ; private set ; }
/// <summary>Quote character assigned in the constructor and passed to the row parser.</summary>
public char QuoteChar { get ; private set ; }
2024-01-12 16:50:37 +08:00
/// <summary>
/// Creates a CSV source for one table and locates that table's SQL file inside
/// <paramref name="inputDir"/>.
/// </summary>
/// <param name="inputDir">Directory whose files are scanned for the table's <c>.sql</c> file; data file names are later combined with it.</param>
/// <param name="tableName">Table name; matched literally as a dot-delimited segment of the SQL file path.</param>
/// <param name="delimiter">Field delimiter exposed through <see cref="Delimiter"/>.</param>
/// <param name="quoteChar">Quote character exposed through <see cref="QuoteChar"/>.</param>
/// <param name="logger">Optional logger; may be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="tableName"/> is <c>null</c>.</exception>
public CsvSource(string inputDir, string tableName, string delimiter = ",", char quoteChar = '"',
    ILogger? logger = null)
{
    ArgumentNullException.ThrowIfNull(tableName);

    _inputDir = inputDir;
    _tableName = tableName;
    _logger = logger;
    Delimiter = delimiter;
    QuoteChar = quoteChar;

    // Escape the table name so regex metacharacters in it ('.', '$', '+', ...) are matched
    // literally instead of changing the meaning of the pattern.
    var pattern = $"^.*\\.{Regex.Escape(tableName)}\\..*\\.sql$";
    var sqlFile = Directory.GetFiles(_inputDir).FirstOrDefault(path => Regex.IsMatch(path, pattern));
    if (sqlFile is null)
    {
        _logger?.LogWarning(
            "No SQL file matching table '{TableName}' was found in '{InputDir}'", tableName, inputDir);
    }

    // The field is deliberately left null when nothing matched (same as before): it is
    // protected and writable, so a derived class may still assign its own path later.
    _sqlFilePath = sqlFile!;
}
2024-01-12 16:50:37 +08:00
//public virtual async ValueTask<bool> ReadAsync()
//{
// var str = await _reader.ReadLineAsync();
// if (string.IsNullOrWhiteSpace(str))
// return false;
2023-12-29 16:16:05 +08:00
2024-01-12 16:50:37 +08:00
// CurrentRaw = str;
2023-12-29 16:16:05 +08:00
2024-01-12 16:50:37 +08:00
// var fields = ParseRow2(str, QuoteChar, Delimiter);
// Current = new DataRecord(fields, _tableName, Headers);
// return true;
//}
2023-12-29 16:16:05 +08:00
/// <summary>
/// Splits a raw row into fields on every occurrence of <paramref name="delimiter"/>.
/// </summary>
/// <remarks>
/// Quote state is intentionally not tracked: a delimiter inside a quoted field still splits
/// the row, so the delimiter must be complex enough never to occur inside the data.
/// Every field (including the first, the last and empty ones) is normalized the same way:
/// a pair of enclosing <paramref name="quoteChar"/> characters is stripped, and the
/// literal <c>\N</c> is replaced by <c>NULL</c>. Fields recognized as hex by the project
/// helpers are decoded afterwards.
/// </remarks>
/// <param name="row">The raw line to split.</param>
/// <param name="quoteChar">Character that may enclose a field.</param>
/// <param name="delimiter">Field separator; compared ordinally.</param>
/// <returns>The parsed fields, in order of appearance.</returns>
/// <exception cref="ArgumentException"><paramref name="row"/> or <paramref name="delimiter"/> is null or empty.</exception>
public string[] ParseRow(string row, char quoteChar, string delimiter)
{
    var span = row.AsSpan();
    if (span.Length == 0)
    {
        throw new ArgumentException("The row is empty", nameof(row));
    }

    if (string.IsNullOrEmpty(delimiter))
    {
        throw new ArgumentException("The delimiter is empty", nameof(delimiter));
    }

    var separator = delimiter.AsSpan();
    var fields = new List<string>();
    var start = 0;

    // Leftmost, non-overlapping matches: after a hit the scan resumes behind the delimiter,
    // so a delimiter at index 0, at the very end, or directly after another one is handled.
    while (true)
    {
        var offset = span[start..].IndexOf(separator, StringComparison.Ordinal);
        if (offset < 0)
        {
            break;
        }

        fields.Add(NormalizeField(span.Slice(start, offset), quoteChar));
        start += offset + separator.Length;
    }

    // The trailing field gets the same normalization as every other field.
    fields.Add(NormalizeField(span[start..], quoteChar));

    for (var k = 0; k < fields.Count; k++)
    {
        var field = fields[k];
        if (DumpDataHelper.CheckHexField(field) && StringExtensions.CheckJsonHex(field))
        {
            fields[k] = StringExtensions.FromHex(field);
        }
    }

    return fields.ToArray();

    // Strips one pair of enclosing quotes and maps the \N marker to NULL.
    static string NormalizeField(ReadOnlySpan<char> raw, char quote)
    {
        // Length check keeps a lone quote character from being treated as an enclosed field.
        if (raw.Length >= 2 && raw[0] == quote && raw[^1] == quote)
        {
            raw = raw[1..^1];
        }

        var text = raw.ToString();
        return text == "\\N" ? "NULL" : text;
    }
}
/// <summary>
/// Splits <paramref name="source"/> into fields, honoring quotes and backslash escapes.
/// </summary>
/// <remarks>
/// Quote characters and backslashes are kept in the returned fields. A backslash marks the
/// next character as escaped, so an escaped quote does not toggle the quoted state.
/// The delimiter is only recognized outside quotes and is matched in full (ordinally), so
/// multi-character delimiters are supported. An empty <paramref name="source"/> yields a
/// single empty field.
/// </remarks>
/// <param name="source">The raw line to split.</param>
/// <param name="quoteChar">Character that opens and closes a quoted section.</param>
/// <param name="delimiter">Field separator; may be longer than one character.</param>
/// <returns>The fields in order of appearance.</returns>
/// <exception cref="ArgumentException"><paramref name="delimiter"/> is null or empty.</exception>
public string[] ParseRow2(ReadOnlySpan<char> source, char quoteChar, string delimiter)
{
    if (string.IsNullOrEmpty(delimiter))
    {
        throw new ArgumentException("The delimiter is empty", nameof(delimiter));
    }

    var separator = delimiter.AsSpan();
    var fields = new List<string>();
    var current = new StringBuilder();
    var inQuotes = false;
    var escaped = false;
    var index = 0;

    while (index < source.Length)
    {
        var ch = source[index];

        if (!escaped && ch == '\\')
        {
            // Remember the escape and keep the backslash in the output.
            escaped = true;
            current.Append(ch);
            index++;
            continue;
        }

        if (!escaped && ch == quoteChar)
        {
            inQuotes = !inQuotes;
            current.Append(ch);
            index++;
            continue;
        }

        if (!inQuotes && source[index..].StartsWith(separator, StringComparison.Ordinal))
        {
            fields.Add(current.ToString());
            current.Clear();
            // Skip the whole delimiter so its remaining characters do not leak into the next field.
            index += separator.Length;
        }
        else
        {
            current.Append(ch);
            index++;
        }

        // An escape only ever covers the single character that follows the backslash.
        escaped = false;
    }

    fields.Add(current.ToString());
    return fields.ToArray();
}
2024-01-12 16:50:37 +08:00
/// <summary>
/// Reads the whole SQL file at <c>_sqlFilePath</c> and asks
/// <c>DumpDataHelper.GetCsvHeadersFromSqlFileAsync</c> to extract the headers from its text.
/// </summary>
/// <returns>The headers returned by the helper.</returns>
public virtual async Task<string[]> GetHeaders()
{
    var sqlText = await File.ReadAllTextAsync(_sqlFilePath);
    var headers = await DumpDataHelper.GetCsvHeadersFromSqlFileAsync(sqlText);
    return headers;
}
/// <summary>
/// Reads the whole SQL file at <c>_sqlFilePath</c> and asks
/// <c>DumpDataHelper.GetCsvFileNamesFromSqlFileAsync</c> for the data file names in it,
/// using a pattern that matches quoted <c>.dat</c> names.
/// </summary>
/// <returns>The file names returned by the helper.</returns>
public virtual async Task<string[]> GetCsvFiles()
{
    var sqlText = await File.ReadAllTextAsync(_sqlFilePath);
    var dataFilePattern = new Regex(@"'.+\.dat'");
    var fileNames = await DumpDataHelper.GetCsvFileNamesFromSqlFileAsync(sqlText, dataFilePattern);
    return fileNames;
}
/// <summary>
/// Reads every data file returned by <see cref="GetCsvFiles"/> line by line, parses each line
/// with <see cref="ParseRow2"/> and passes the resulting record to <paramref name="action"/>.
/// </summary>
/// <param name="action">Callback invoked once per parsed record; skipped when <c>null</c>.</param>
/// <returns>A task that completes when all files have been read.</returns>
public virtual async Task DoEnqueue(Action<DataRecord> action)
{
    var sourceFiles = await GetCsvFiles();

    // Headers come from the SQL file, not from the data files, so they are loaded once
    // (lazily, only if there is at least one data file) instead of once per file.
    string[]? headers = null;

    foreach (var file in sourceFiles)
    {
        headers ??= await GetHeaders();

        var filePath = Path.Combine(_inputDir, file);
        using var stream = File.OpenRead(filePath);
        using var reader = new StreamReader(stream);

        // ReadLineAsync returns null at end of stream; this avoids StreamReader.EndOfStream,
        // which can block synchronously inside an async method.
        while (await reader.ReadLineAsync() is { } line)
        {
            var fields = ParseRow2(line, QuoteChar, Delimiter);
            action?.Invoke(new DataRecord(fields, _tableName, headers));
        }
    }
}
/// <summary>
/// Builds a record from the first line of the first data file returned by
/// <see cref="GetCsvFiles"/>.
/// </summary>
/// <returns>The parsed record, or <c>null</c> when there is no data file.</returns>
public virtual async Task<DataRecord?> GetTestRecord()
{
    var candidates = await GetCsvFiles();
    var firstFile = candidates.FirstOrDefault();
    if (firstFile == null)
    {
        return null;
    }

    var headers = await GetHeaders();
    var fullPath = Path.Combine(_inputDir, firstFile);

    using var stream = File.OpenRead(fullPath);
    using var reader = new StreamReader(stream);

    var firstLine = await reader.ReadLineAsync();
    var parsed = ParseRow2(firstLine, QuoteChar, Delimiter);
    return new DataRecord(parsed, _tableName, headers);
}
/// <summary>
/// Does nothing: the readers used by this class are created and disposed locally in the
/// methods that read files, and the former reader field is commented out.
/// </summary>
public void Dispose ( )
{
// _reader.Dispose();
}
2023-12-29 16:16:05 +08:00
}