From 44ae5a28ef120d6b1ee6edbc2e42273b8a7de51e Mon Sep 17 00:00:00 2001 From: huby2358 Date: Thu, 7 Nov 2024 11:55:24 +0800 Subject: [PATCH] refactor: reduce memory alloc in makeBatchRows (#19701) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 之前一个pr将makeBatchRows这里batch提前分配8192行内存(实际可能并不需要这么多),改为按照实际数据分配 Approved by: @ouyuanning, @m-schen, @heni02, @aressu1985 --- pkg/sql/colexec/external/external.go | 113 +++++++----------- .../cases/load_data/load_data.result | 5 + .../distributed/cases/load_data/load_data.sql | 4 + .../resources/load_data/integer_numbers_4.csv | 1 + 4 files changed, 54 insertions(+), 69 deletions(-) create mode 100644 test/distributed/resources/load_data/integer_numbers_4.csv diff --git a/pkg/sql/colexec/external/external.go b/pkg/sql/colexec/external/external.go index af9f660e46d3..098c4a112285 100644 --- a/pkg/sql/colexec/external/external.go +++ b/pkg/sql/colexec/external/external.go @@ -823,16 +823,6 @@ func makeType(typ *plan.Type, flag bool) types.Type { return types.New(types.T(typ.Id), typ.Width, typ.Scale) } -func initBatch(batchSize int, proc *process.Process, bat *batch.Batch) error { - if err := bat.PreExtend(proc.GetMPool(), batchSize); err != nil { - return err - } - for i := range bat.Vecs { - bat.Vecs[i].SetLength(batchSize) - } - return nil -} - func getRealAttrCnt(attrs []string, cols []*plan.ColDef) int { cnt := 0 for i := 0; i < len(attrs); i++ { @@ -1275,11 +1265,7 @@ func getOneRowDataNonRestrictive(bat *batch.Batch, line []csvparser.Field, rowId continue } vec := bat.Vecs[colIdx] - if param.Cols[colIdx].Hidden { - nulls.Add(vec.GetNulls(), uint64(rowIdx)) - continue - } - nulls.Add(vec.GetNulls(), uint64(rowIdx)) + vector.AppendBytes(vec, nil, true, mp) } return nil } @@ -1296,7 +1282,7 @@ func getColData(bat *batch.Batch, line []csvparser.Field, rowIdx int, param *Ext vec := bat.Vecs[colIdx] if param.Cols[colIdx].Hidden { - nulls.Add(vec.GetNulls(), uint64(rowIdx)) + vector.AppendBytes(vec, nil, true, mp) return nil } @@ -1312,12 +1298,12 @@ func getColData(bat *batch.Batch, line []csvparser.Field, rowIdx int, param *Ext isNullOrEmpty = isNullOrEmpty || len(field.Val) == 0 } if isNullOrEmpty { - nulls.Add(vec.GetNulls(), uint64(rowIdx)) + vector.AppendBytes(vec, nil, true, mp) return nil } if param.ParallelLoad { - err := vector.SetStringAt(vec, rowIdx, field.Val, mp) + err := vector.AppendBytes(vec, []byte(field.Val), false, mp) if err != nil { return err } @@ -1330,7 +1316,7 @@ func getColData(bat *batch.Batch, line []csvparser.Field, rowIdx int, param *Ext if err != nil { return moerr.NewInternalErrorf(param.Ctx, "the input value '%s' is not bool type for column %d", field.Val, colIdx) } - if err := vector.SetFixedAtNoTypeCheck(vec, rowIdx, b); err != nil { + if err = vector.AppendFixed(vec, b, false, mp); err != nil { return err } case types.T_bit: @@ -1346,13 +1332,14 @@ func getColData(bat *batch.Batch, line []csvparser.Field, rowIdx int, param *Ext if val > uint64(1<= size { + if curBatchSize >= param.maxBatchSize { break } } - for i := range bat.Vecs { - bat.Vecs[i].SetLength(cnt) - } n := bat.Vecs[0].Length() if unexpectEOF && n > 0 { n-- diff --git a/test/distributed/cases/load_data/load_data.result b/test/distributed/cases/load_data/load_data.result index dda78bdd082d..fccd10c6a2d1 100644 --- a/test/distributed/cases/load_data/load_data.result +++ b/test/distributed/cases/load_data/load_data.result @@ -9,6 +9,11 @@ col6 smallint unsigned, col7 int unsigned, col8 bigint unsigned ); +load data infile '$resources/load_data/integer_numbers_4.csv' into table t1 fields terminated by ','; +select * from t1; +col1 col2 col3 col4 col5 col6 col7 col8 +1 234 2147483642 92233720368547 254 65533 4294967294 1844674407370956 +delete from t1; load data infile '$resources/load_data/integer_numbers_1.csv' into table t1 fields terminated by ','; select * from t1; col1 col2 col3 col4 col5 col6 col7 col8 diff --git a/test/distributed/cases/load_data/load_data.sql b/test/distributed/cases/load_data/load_data.sql index 6290dac57ab3..078007b71030 100644 --- a/test/distributed/cases/load_data/load_data.sql +++ b/test/distributed/cases/load_data/load_data.sql @@ -13,6 +13,10 @@ col8 bigint unsigned ); -- load data +load data infile '$resources/load_data/integer_numbers_4.csv' into table t1 fields terminated by ','; +select * from t1; +delete from t1; + load data infile '$resources/load_data/integer_numbers_1.csv' into table t1 fields terminated by ','; select * from t1; diff --git a/test/distributed/resources/load_data/integer_numbers_4.csv b/test/distributed/resources/load_data/integer_numbers_4.csv new file mode 100644 index 000000000000..1c4ed1b10383 --- /dev/null +++ b/test/distributed/resources/load_data/integer_numbers_4.csv @@ -0,0 +1 @@ +"1.2","234.4","2147483642.3","92233720368547.4","254.7","65533.3","4294967294.2","1844674407370955.9" \ No newline at end of file