Csv解析性能优化
This commit is contained in:
parent
8db7c71170
commit
41a1dc8a4f
@ -49,43 +49,44 @@ public class CsvReader : IDataReader
|
|||||||
if (string.IsNullOrWhiteSpace(str))
|
if (string.IsNullOrWhiteSpace(str))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
var fields = ParseRow(str, QuoteChar, Delimiter);
|
var fields = ParseRow(str, QuoteChar, Delimiter[0]);
|
||||||
Current = new DataRecord(fields, TableName, Headers);
|
Current = new DataRecord(fields, TableName, Headers);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public string[] ParseRow(ReadOnlySpan<char> source, char quoteChar, string delimiter)
|
public static string[] ParseRow(ReadOnlySpan<char> source, char quoteChar, char delimiter)
|
||||||
{
|
{
|
||||||
var result = new List<string>();
|
var result = new List<string>();
|
||||||
var index = -1;
|
var index = -1;
|
||||||
var current = new StringBuilder();
|
var current = new StringBuilder(source.Length);
|
||||||
var hasQuote = false;
|
var hasQuote = false;
|
||||||
var hasSlash = false;
|
var hasSlash = false;
|
||||||
while (index < source.Length - 1)
|
while (index < source.Length - 1)
|
||||||
{
|
{
|
||||||
index++;
|
index++;
|
||||||
if (hasSlash == false && source[index] == '\\')
|
var currChar = source[index];
|
||||||
|
if (hasSlash == false && currChar == '\\')
|
||||||
{
|
{
|
||||||
hasSlash = true;
|
hasSlash = true;
|
||||||
current.Append('\\');
|
current.Append('\\');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasSlash == false && source[index] == quoteChar)
|
if (hasSlash == false && currChar == quoteChar)
|
||||||
{
|
{
|
||||||
hasQuote = !hasQuote;
|
hasQuote = !hasQuote;
|
||||||
current.Append(source[index]);
|
current.Append(currChar);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasQuote == false && source[index] == delimiter[0])
|
if (hasQuote == false && currChar == delimiter)
|
||||||
{
|
{
|
||||||
result.Add(current.ToString());
|
result.Add(current.ToString());
|
||||||
current.Clear();
|
current.Clear();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
current.Append(source[index]);
|
current.Append(currChar);
|
||||||
}
|
}
|
||||||
|
|
||||||
hasSlash = false;
|
hasSlash = false;
|
||||||
@ -94,6 +95,54 @@ public class CsvReader : IDataReader
|
|||||||
result.Add(current.ToString());
|
result.Add(current.ToString());
|
||||||
return result.ToArray();
|
return result.ToArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Splits one CSV row into its raw fields without building intermediate strings per character.
/// </summary>
/// <remarks>
/// Fields are returned exactly as they appear in <paramref name="source"/>: surrounding quote
/// characters and backslashes are kept. A delimiter only separates fields when it is outside a
/// quoted section; a quote character preceded by a backslash does not open or close a quoted section.
/// </remarks>
/// <param name="source">The row text to split.</param>
/// <param name="quoteChar">Character that opens and closes a quoted section.</param>
/// <param name="delimiter">Character that separates fields.</param>
/// <param name="columnCount">Initial capacity of the returned list (expected number of fields).</param>
/// <returns>The fields of the row in order; an empty row yields a single empty field.</returns>
public static List<string> ParseRowFaster(ReadOnlySpan<char> source, char quoteChar, char delimiter, int columnCount = 10)
{
    var result = new List<string>(columnCount);
    var hasQuote = false;
    var hasSlash = false;
    var start = 0; // index of the first character of the field currently being scanned

    for (var index = 0; index < source.Length; index++)
    {
        var currChar = source[index];

        if (hasSlash)
        {
            // The previous character was an unescaped backslash, so this character can neither
            // start a new escape nor toggle quoting.
            hasSlash = false;
        }
        else if (currChar is '\\')
        {
            hasSlash = true;
            continue;
        }
        else if (currChar == quoteChar)
        {
            hasQuote = !hasQuote;
            continue;
        }

        if (!hasQuote && currChar == delimiter)
        {
            // Slice up to, but not including, the delimiter itself.
            result.Add(source[start..index].ToString());
            start = index + 1;
        }
    }

    // Whatever follows the last delimiter (possibly nothing) is the final field.
    result.Add(source[start..].ToString());
    return result;
}
|
||||||
|
|
||||||
public virtual void Dispose()
|
public virtual void Dispose()
|
||||||
{
|
{
|
||||||
|
@ -31,7 +31,7 @@ public class ZstReader : CsvReader
|
|||||||
if (string.IsNullOrWhiteSpace(str))
|
if (string.IsNullOrWhiteSpace(str))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
var fields = ParseRow(str, QuoteChar, Delimiter);
|
var fields = ParseRow(str, QuoteChar, Delimiter[0]);
|
||||||
Current = new DataRecord(fields, TableName, Headers);
|
Current = new DataRecord(fields, TableName, Headers);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"Input":{
|
"Input":{
|
||||||
"InputDir": "D:\\Dump\\MyDumper-ZST 2024-02-05", // Csv数据输入目录
|
"InputDir": "D:\\Dump\\NewMockData", // Csv数据输入目录
|
||||||
"UseMock": false, // 使用模拟数据进行测试
|
"UseMock": false, // 使用模拟数据进行测试
|
||||||
"MockCountMultiplier": 1 // 模拟数据量级的乘数
|
"MockCountMultiplier": 1 // 模拟数据量级的乘数
|
||||||
},
|
},
|
||||||
|
74
MesETL.Test/Test.cs
Normal file
74
MesETL.Test/Test.cs
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
using System.Diagnostics;
|
||||||
|
using MesETL.App.Services.ETL;
|
||||||
|
using Xunit.Abstractions;
|
||||||
|
using ZstdSharp;
|
||||||
|
|
||||||
|
namespace TestProject1;
|
||||||
|
|
||||||
|
/// <summary>
/// Throughput probe and parser checks for the CSV row parsers on <see cref="CsvReader"/>.
/// </summary>
public class Test
{
    private readonly ITestOutputHelper _output;

    public Test(ITestOutputHelper output)
    {
        _output = output;
    }

    /// <summary>
    /// Decompresses a zstd-compressed CSV dump line by line, parses every row and reports the
    /// parsing speed (rows per second) roughly every two seconds.
    /// </summary>
    /// <param name="inputFile">Path of the <c>.zst</c> file to read.</param>
    [Theory]
    [InlineData([@"D:\Dump\NewMockData2\cferp.order_box_block.00000.dat.zst"])]
    public async Task ZstdDecompressTest(string inputFile)
    {
        var count = 0;
        var flag = true;
        var sw = Stopwatch.StartNew();

        // Each layer is disposed explicitly so the file handle is released even if the
        // decompression stream leaves its inner stream open.
        using var file = File.OpenRead(inputFile);
        using var decompressed = new DecompressionStream(file);
        using var reader = new StreamReader(decompressed);

        var monitor = Task.Run(async () =>
        {
            var lastElapse = sw.ElapsedMilliseconds;
            var lastCount = 0;
            while (flag)
            {
                await Task.Delay(2000);

                _output.WriteLine($"speed: {(count - lastCount) / ((sw.ElapsedMilliseconds - lastElapse) / 1000f)}");
                lastElapse = sw.ElapsedMilliseconds;
                lastCount = count;
            }
        });

        try
        {
            // ReadLineAsync returns null at end of stream; this avoids the synchronous read
            // that StreamReader.EndOfStream performs.
            while (await reader.ReadLineAsync() is { } str)
            {
                CsvReader.ParseRowFaster(str, '"', ',');
                count++;
            }
        }
        finally
        {
            // Stop the monitor loop even when reading or parsing fails.
            flag = false;
        }

        // Await instead of blocking the test thread with Wait().
        await monitor;
    }

    /// <summary>Sample rows used as input for <see cref="ParseRowFasterTest"/>.</summary>
    public static IEnumerable<object[]> ParseRowData()
    {
        yield return
            [@"20220104020855,""2022-01-04 10:06:46"",1455,""0001-01-01 00:00:00"",""1"",0,""2"",""0"",\N,""0"",22010"];
        yield return
            [@"20220104020858,""2022-01-04 15:08:22"",1455,""0001-01-01 00:00:00"",""1"",838,""2"",""0"",""5"",""0"",22010"];
        yield return
            [@"5586326,20220104020855,220105981029,""1"",482278,482279,3768774,0,0,""1.000"",1455,22010"];
        yield return
            [@"130658,""PD220104002302"",3,4616,""2022-01-04 15:10:40"",1443,""2022-01-04 15:10:40"",""2022-01-04 15:10:51"",0,"""",0,1455,""0001-01-01 00:00:00"",1,5B32303232303130343032303835385D,E590B8E5A1912D2DE590B8E5A1912D2D31382D2D323030302A3630302D2D3130E789872D2D352E3936333B6361696C69616F2D2D79616E73652D2D392D2D323031302A313137342D2D31E789872D2D322E3336,""0"",0"];
    }

    /// <summary>
    /// Checks that <c>ParseRowFaster</c> splits a row into the same fields as <c>ParseRow</c>,
    /// which serves as the reference implementation.
    /// </summary>
    /// <param name="row">One raw CSV row.</param>
    [Theory]
    [MemberData(nameof(ParseRowData))]
    public void ParseRowFasterTest(string row)
    {
        var expected = CsvReader.ParseRow(row, '"', ',');

        var fields = CsvReader.ParseRowFaster(row, '"', ',');

        _output.WriteLine(string.Join(',', fields));
        Assert.Equal(expected, fields.ToArray());
    }
}
|
Loading…
Reference in New Issue
Block a user