改进ZstReader读取方法,大幅优化内存性能

This commit is contained in:
陈梓阳 2024-12-11 18:08:16 +08:00
parent 0e28d639c1
commit b34ac104ef

View File

@ -1,4 +1,6 @@
using Microsoft.Extensions.Logging; using System.Runtime.InteropServices;
using System.Text;
using Microsoft.Extensions.Logging;
using ZstdSharp; using ZstdSharp;
namespace MesETL.App.Services.ETL; namespace MesETL.App.Services.ETL;
@ -11,7 +13,12 @@ public class ZstReader : CsvReader
protected new readonly Lazy<StreamReader> Reader; protected new readonly Lazy<StreamReader> Reader;
private Stream? _stream; private Stream? _stream;
// Holds the leading part of a row that did not end inside one buffer fill; emptied after each row is parsed.
private readonly List<char> _str = new(1024);
// Window of characters most recently read from the underlying reader.
private readonly char[] _charBuffer = new char[1024];
// Number of valid characters currently in _charBuffer (the count returned by the last read).
private int _charLen = 0;
// Index in _charBuffer of the next character that has not been consumed yet.
private int _charPos = 0;
public ZstReader(string filePath, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null) public ZstReader(string filePath, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null)
: base(filePath, tableName, headers, delimiter, quoteChar, logger) : base(filePath, tableName, headers, delimiter, quoteChar, logger)
{ {
@ -20,6 +27,7 @@ public class ZstReader : CsvReader
_stream = new DecompressionStream(File.OpenRead(filePath)); _stream = new DecompressionStream(File.OpenRead(filePath));
return new StreamReader(_stream); return new StreamReader(_stream);
}, false); }, false);
ReadBuffer();
} }
public ZstReader(Stream stream, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null) public ZstReader(Stream stream, string tableName, string[] headers, string delimiter = ",", char quoteChar = '\"', ILogger? logger = null)
@ -27,16 +35,56 @@ public class ZstReader : CsvReader
{ {
var ds = new DecompressionStream(stream); var ds = new DecompressionStream(stream);
Reader = new Lazy<StreamReader>(() => new StreamReader(ds), false); Reader = new Lazy<StreamReader>(() => new StreamReader(ds), false);
ReadBuffer();
}
/// <summary>
/// Discards whatever is left in the character buffer and refills it from the reader.
/// </summary>
/// <returns>The number of characters now available in the buffer; 0 when the reader has nothing more to give.</returns>
private int ReadBuffer()
{
    // Reset both cursors first so the buffer reads as empty even if the read below throws.
    _charPos = 0;
    _charLen = 0;

    var charsRead = Reader.Value.ReadBlock(_charBuffer);
    _charLen = charsRead;
    return charsRead;
}
/// <summary>
/// Advances to the next row of the decompressed text and stores it in <c>Current</c>.
/// </summary>
/// <remarks>
/// A row is sliced directly out of <c>_charBuffer</c> when it fits in the current window;
/// characters are copied into <c>_str</c> only when a row continues past the end of the buffer.
/// A row ends at '\r', '\n' or the pair "\r\n".
/// The method contains no awaits, so it is intentionally not marked <c>async</c> and returns an
/// already-completed <see cref="ValueTask{TResult}"/> instead of building an async state machine.
/// NOTE(review): an empty line is parsed and published as a record rather than ending the read —
/// confirm that this is the intended behavior.
/// </remarks>
/// <returns>
/// <c>true</c> when a row was stored in <c>Current</c>; <c>false</c> when the buffer is exhausted
/// and a refill yields no characters.
/// </returns>
public override ValueTask<bool> ReadAsync()
{
    // The buffer has been fully consumed and the refill produced nothing: end of input.
    if (_charPos == _charLen && ReadBuffer() == 0)
        return ValueTask.FromResult(false);

    do
    {
        // Unconsumed part of the current buffer window.
        var span = _charBuffer.AsSpan(_charPos, _charLen - _charPos);
        var newLineIdx = span.IndexOfAny('\r', '\n');

        if (newLineIdx < 0)
        {
            // No terminator in this window: keep the fragment and try the next fill.
            _str.AddRange(span);
            continue;
        }

        // Capture the terminator now; a refill below overwrites the buffer.
        var terminator = span[newLineIdx];

        Span<char> row;
        if (_str.Count == 0)
        {
            // Fast path: the whole row lies inside the buffer, parse it in place without copying.
            row = span[..newLineIdx];
        }
        else
        {
            // The row started in an earlier fill: append its tail and parse the accumulated text.
            _str.AddRange(span[..newLineIdx]);
            row = CollectionsMarshal.AsSpan(_str);
        }

        var fields = ParseRowFaster(row, QuoteChar, Delimiter[0]);
        Current = new DataRecord(fields, TableName, Headers);
        _str.Clear();

        _charPos += newLineIdx + 1;

        // Treat "\r\n" as a single terminator, including when '\n' is the first character of the next fill.
        if (terminator == '\r' && (_charPos < _charLen || ReadBuffer() > 0) && _charBuffer[_charPos] == '\n')
            ++_charPos;

        return ValueTask.FromResult(true);
    } while (ReadBuffer() > 0);

    // Input ended without a trailing terminator: whatever was accumulated is the last row.
    var lastFields = ParseRowFaster(CollectionsMarshal.AsSpan(_str), QuoteChar, Delimiter[0]);
    Current = new DataRecord(lastFields, TableName, Headers);
    _str.Clear();
    return ValueTask.FromResult(true);
}