2024-01-15 17:26:44 +08:00
using System.Reflection.PortableExecutable ;
using System.Text ;
2024-01-12 16:50:37 +08:00
using System.Text.RegularExpressions ;
2023-12-29 16:16:05 +08:00
using ConsoleApp2.Helpers ;
2024-01-12 16:50:37 +08:00
using ConsoleApp2.HostedServices.Abstractions ;
2023-12-29 16:16:05 +08:00
using Microsoft.Extensions.Logging ;
namespace ConsoleApp2.Services ;
2024-01-04 09:00:44 +08:00
/// <summary>
/// Reads rows from CSV data files.
/// </summary>
2024-01-12 16:50:37 +08:00
public class CsvSource : IDataSource
2023-12-29 16:16:05 +08:00
{
2024-01-12 16:50:37 +08:00
protected readonly string _inputDir ;
//protected readonly StreamReader _reader;
2023-12-29 16:16:05 +08:00
private readonly ILogger ? _logger ;
2024-01-12 16:50:37 +08:00
protected readonly string _tableName ;
2024-01-15 17:26:44 +08:00
protected string? _sqlFilePath ;
protected readonly string? _sqlFileText ;
protected string [ ] ? headers ;
protected string [ ] ? csvFiles ;
2024-01-12 16:50:37 +08:00
public string? CurrentRaw { get ; protected set ; }
2023-12-29 16:16:05 +08:00
public string Delimiter { get ; private set ; }
public char QuoteChar { get ; private set ; }
2024-01-12 16:50:37 +08:00
/// <summary>
/// Creates a CSV source for one table and looks up the table's SQL file in <paramref name="inputDir"/>.
/// </summary>
/// <param name="inputDir">Directory that is searched for the SQL file and later combined with the CSV file names.</param>
/// <param name="tableName">Table name; the SQL file path must contain it between two dots and end in <c>.sql</c>.</param>
/// <param name="delimiter">Field delimiter, exposed through <see cref="Delimiter"/>.</param>
/// <param name="quoteChar">Quote character, exposed through <see cref="QuoteChar"/>.</param>
/// <param name="logger">Optional logger.</param>
public CsvSource(string inputDir, string tableName, string delimiter = ",", char quoteChar = '"',
    ILogger? logger = null)
{
    _inputDir = inputDir;
    _tableName = tableName;
    _logger = logger;
    Delimiter = delimiter;
    QuoteChar = quoteChar;

    // Escape the table name so that regex metacharacters inside it (e.g. '$' or '+')
    // are matched literally instead of being interpreted as pattern syntax.
    var pattern = $"^.*\\.{Regex.Escape(tableName)}\\..*\\.sql$";

    // The first matching path wins; the field stays null when no SQL file matches.
    _sqlFilePath = Directory.GetFiles(_inputDir).FirstOrDefault(path => Regex.IsMatch(path, pattern));
}
/// <summary>
/// Splits a row into fields on every occurrence of <paramref name="delimiter"/>.
/// </summary>
/// <remarks>
/// Delimiter matching deliberately ignores whether the position is inside quotes, so the delimiter
/// has to be distinctive enough that it never occurs inside a field value.
/// A field enclosed by <paramref name="quoteChar"/> on both sides has the enclosing pair removed,
/// and a field equal to <c>\N</c> becomes <c>NULL</c>; this now applies to the last field as well.
/// Finally, every field for which both <c>DumpDataHelper.CheckHexField</c> and
/// <c>StringExtensions.CheckJsonHex</c> return true is replaced by <c>StringExtensions.FromHex(field)</c>.
/// </remarks>
/// <param name="row">The raw row text; must not be empty.</param>
/// <param name="quoteChar">Character that may enclose a field.</param>
/// <param name="delimiter">Field separator; may be longer than one character, must not be empty.</param>
/// <returns>The fields of the row in order.</returns>
/// <exception cref="ArgumentException">The row or the delimiter is empty.</exception>
public string[] ParseRow(string row, char quoteChar, string delimiter)
{
    var span = row.AsSpan();
    if (span.Length == 0)
    {
        throw new ArgumentException("The row is empty", nameof(row));
    }

    if (string.IsNullOrEmpty(delimiter))
    {
        throw new ArgumentException("The delimiter is empty", nameof(delimiter));
    }

    var result = new List<string>();
    var start = 0;
    var i = 0;

    // Scan from index 0 so an empty leading field is recognised, and allow a match that ends
    // exactly at the end of the row so a trailing empty field is recognised too.
    while (i <= span.Length - delimiter.Length)
    {
        // Ordinal comparison: the delimiter is machine data, not linguistic text.
        if (span[i..].StartsWith(delimiter.AsSpan(), StringComparison.Ordinal))
        {
            result.Add(NormalizeField(span[start..i], quoteChar));

            // Jump over the whole delimiter so its own characters are never re-scanned.
            i += delimiter.Length;
            start = i;
        }
        else
        {
            i++;
        }
    }

    result.Add(NormalizeField(span[start..], quoteChar));

    for (var n = 0; n < result.Count; n++)
    {
        var field = result[n];
        if (DumpDataHelper.CheckHexField(field) && StringExtensions.CheckJsonHex(field))
        {
            result[n] = StringExtensions.FromHex(field);
        }
    }

    return result.ToArray();

    // Strips one enclosing pair of quote characters and maps the \N marker to NULL.
    static string NormalizeField(ReadOnlySpan<char> raw, char quote)
    {
        // Length check: a field consisting of a single quote character is not an enclosed field.
        if (raw.Length >= 2 && raw[0] == quote && raw[^1] == quote)
        {
            raw = raw[1..^1];
        }

        var text = raw.ToString();
        return text == "\\N" ? "NULL" : text;
    }
}
/// <summary>
/// Splits a row into raw fields, honouring quotes and backslash escapes.
/// </summary>
/// <remarks>
/// A backslash that is not itself escaped marks the next character as escaped. An unescaped
/// <paramref name="quoteChar"/> toggles the in-quotes state. The delimiter only separates fields
/// while outside quotes. Quote characters and backslashes are kept in the returned fields.
/// The whole <paramref name="delimiter"/> string is matched, so multi-character delimiters work;
/// for a single-character delimiter the result is the same as matching its first character.
/// </remarks>
/// <param name="source">The row text.</param>
/// <param name="quoteChar">Character that opens and closes a quoted section.</param>
/// <param name="delimiter">Field separator; must not be empty.</param>
/// <returns>The fields of the row in order; an empty row yields one empty field.</returns>
/// <exception cref="ArgumentException">The delimiter is null or empty.</exception>
public string[] ParseRow2(ReadOnlySpan<char> source, char quoteChar, string delimiter)
{
    if (string.IsNullOrEmpty(delimiter))
    {
        throw new ArgumentException("The delimiter is empty", nameof(delimiter));
    }

    var fields = new List<string>();
    var current = new StringBuilder();
    var inQuotes = false;
    var escaped = false;
    var index = 0;

    while (index < source.Length)
    {
        var ch = source[index];

        if (!escaped && ch == '\\')
        {
            // Keep the backslash in the output and remember that the next character is escaped.
            escaped = true;
            current.Append(ch);
            index++;
            continue;
        }

        if (!escaped && ch == quoteChar)
        {
            inQuotes = !inQuotes;
            current.Append(ch);
            index++;
            continue;
        }

        if (!inQuotes && source[index..].StartsWith(delimiter.AsSpan(), StringComparison.Ordinal))
        {
            fields.Add(current.ToString());
            current.Clear();

            // Consume the complete delimiter, not just its first character.
            index += delimiter.Length;
        }
        else
        {
            current.Append(ch);
            index++;
        }

        escaped = false;
    }

    fields.Add(current.ToString());
    return fields.ToArray();
}
2024-01-15 17:26:44 +08:00
/// <summary>
/// Reads the table's SQL file and fills <c>headers</c> and <c>csvFiles</c> from its text
/// using the <c>DumpDataHelper</c> extraction methods.
/// </summary>
/// <exception cref="FileNotFoundException">No SQL file path is set, i.e. no matching file was found in the input directory.</exception>
public virtual async Task GetHeaderAndCsvFiles()
{
    // Fail with a meaningful message instead of the ArgumentNullException File.ReadAllTextAsync would raise.
    if (_sqlFilePath is null)
    {
        throw new FileNotFoundException(
            $"No SQL file for table '{_tableName}' was found in '{_inputDir}'.");
    }

    var text = await File.ReadAllTextAsync(_sqlFilePath);
    headers = await DumpDataHelper.GetCsvHeadersFromSqlFileAsync(text);

    // Single-quoted names ending in ".dat.zst": both dots are literal, and [^']+ keeps a match
    // from running across several quoted names on the same line.
    csvFiles = await DumpDataHelper.GetCsvFileNamesFromSqlFileAsync(text, new Regex(@"'[^']+\.dat\.zst'"));
}
/// <summary>
/// Loads the headers and CSV file list, then reads every CSV file line by line, parses each line
/// with <see cref="ParseRow2"/> and passes the resulting <c>DataRecord</c> to <paramref name="action"/>.
/// </summary>
/// <param name="action">Callback invoked once per parsed line; a null callback is skipped.</param>
public virtual async Task DoEnqueue(Action<DataRecord> action)
{
    await GetHeaderAndCsvFiles();

    // csvFiles is declared nullable; treat "no list" the same as "no files".
    foreach (var file in csvFiles ?? Array.Empty<string>())
    {
        var filePath = Path.Combine(_inputDir, file);
        using var fs = File.OpenRead(filePath);
        using var sr = new StreamReader(fs);

        // ReadLineAsync returns null at end of stream. Checking EndOfStream instead would
        // perform a synchronous, blocking read inside this async method.
        while (await sr.ReadLineAsync() is { } line)
        {
            var fields = ParseRow2(line, QuoteChar, Delimiter);
            var record = new DataRecord(fields, _tableName, headers);
            action?.Invoke(record);
        }
    }
}
/// <summary>
/// Loads the headers and CSV file list and builds a <c>DataRecord</c> from the first line of the first CSV file.
/// </summary>
/// <returns>
/// The record for the first line, or null when there is no CSV file or the first file has no lines.
/// </returns>
public virtual async Task<DataRecord?> GetTestRecord()
{
    await GetHeaderAndCsvFiles();

    // csvFiles is declared nullable; a missing list is handled like an empty one.
    var file = csvFiles?.FirstOrDefault();
    if (file is null)
    {
        return null;
    }

    var filePath = Path.Combine(_inputDir, file);
    using var fs = File.OpenRead(filePath);
    using var sr = new StreamReader(fs);

    var line = await sr.ReadLineAsync();
    if (line is null)
    {
        // Empty file: there is no row to build a record from.
        return null;
    }

    var fields = ParseRow2(line, QuoteChar, Delimiter);
    return new DataRecord(fields, _tableName, headers);
}
/// <summary>
/// Intentionally does nothing: the streams opened by the reading methods in this class are
/// disposed inside those methods, so there is nothing to release here.
/// </summary>
public void Dispose()
{
}
2023-12-29 16:16:05 +08:00
}