Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
3b37e33
func/regexp_extract: new scalar func and test cases
stephenkgu Apr 21, 2026
6d2e73a
fix ai review issues
stephenkgu Apr 21, 2026
6548192
fix c null-terminated string's mem manipulation
stephenkgu Apr 21, 2026
d770e45
Update source/libs/scalar/src/sclfunc.c
stephenkgu Apr 21, 2026
dc985ca
Update source/libs/scalar/src/sclfunc.c
stephenkgu Apr 21, 2026
c4a5619
Update source/libs/function/src/builtins.c
stephenkgu Apr 21, 2026
6a151e2
Apply suggestions from copilot code review
stephenkgu Apr 21, 2026
c7a78f4
fix group idx range checking with prepared statements
stephenkgu Apr 21, 2026
a41023d
manage mem buffer outside of loop
stephenkgu Apr 21, 2026
b384578
use int64_t to validate group index arg instead of int32_t for potent…
stephenkgu Apr 21, 2026
20f4090
Merge branch '3.0' into fix/6968250338
stephenkgu Apr 22, 2026
3505a52
fix(ext-win): fix external window compilation
stephenkgu Apr 22, 2026
40293fd
Merge branch 'fix/ext-win-compile' into fix/6968250338
stephenkgu Apr 22, 2026
a0f68be
Merge branch '3.0' into fix/6968250338
stephenkgu Apr 22, 2026
f40ccdd
Update source/libs/executor/src/externalwindowoperator.c
stephenkgu Apr 22, 2026
3213b43
Update source/libs/scalar/src/sclfunc.c
stephenkgu Apr 22, 2026
9e090be
Update source/libs/scalar/src/sclfunc.c
stephenkgu Apr 22, 2026
6bdc547
Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py
stephenkgu Apr 22, 2026
b11c324
use cleanup block instead of direct return
stephenkgu Apr 22, 2026
d56a93e
use header extra buffer even though its length contains header already
stephenkgu Apr 22, 2026
5878f49
Update source/libs/function/src/builtins.c
stephenkgu Apr 22, 2026
a7b7194
use macro for maximum of group index instead of hard-coded
stephenkgu Apr 22, 2026
99b7b3a
Merge branch 'fix/6968250338' of https://github.com/taosdata/TDengine…
stephenkgu Apr 22, 2026
872601f
Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py
stephenkgu Apr 22, 2026
0189d82
Update source/libs/function/src/builtins.c
stephenkgu Apr 22, 2026
37f599a
en & zh doc for this func
stephenkgu Apr 22, 2026
9cf7858
Merge branch 'fix/6968250338' of https://github.com/taosdata/TDengine…
stephenkgu Apr 22, 2026
640f580
Update source/libs/function/src/builtins.c
stephenkgu Apr 22, 2026
38ac272
Merge branch 'fix/6968250338' of https://github.com/taosdata/TDengine…
stephenkgu Apr 22, 2026
816ccb5
fix undefined behavior with reg lib
stephenkgu Apr 22, 2026
984b053
move character converting buffer out of loop to dismiss allocating ea…
stephenkgu Apr 23, 2026
ceeb5cb
copy result data to dismiss dangling pointers
stephenkgu Apr 23, 2026
21e6d42
revert set data value with true, the 4th args is isNull, not isCopy
stephenkgu Apr 23, 2026
7a86c29
fix zh, en doc examples and validate with test cases
stephenkgu Apr 23, 2026
9ccae40
Update source/libs/function/src/builtins.c
stephenkgu Apr 23, 2026
7c471c3
pre-formats error message to fix compilation
stephenkgu Apr 23, 2026
5670d20
remove inaccurate comments
stephenkgu Apr 23, 2026
388cfe0
Update docs/zh/14-reference/03-taos-sql/22-function.md
stephenkgu Apr 23, 2026
7203c84
Update docs/en/14-reference/03-taos-sql/22-function.md
stephenkgu Apr 23, 2026
068a98c
Update source/libs/scalar/src/sclfunc.c
stephenkgu Apr 23, 2026
9652c1e
new test case for big group index
stephenkgu Apr 23, 2026
4e13f1d
new test case for null pattern
stephenkgu Apr 23, 2026
7b2b79d
Update source/libs/scalar/src/sclfunc.c
stephenkgu Apr 23, 2026
663e85b
Update docs/en/14-reference/03-taos-sql/22-function.md
stephenkgu Apr 23, 2026
d0728c9
Update docs/zh/14-reference/03-taos-sql/22-function.md
stephenkgu Apr 23, 2026
be87866
Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py
stephenkgu Apr 23, 2026
193997a
Update source/libs/function/src/builtins.c
stephenkgu Apr 23, 2026
b34c9a1
Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py
stephenkgu Apr 23, 2026
6e4743f
log reg exec error message to make production debugging actionable
stephenkgu Apr 23, 2026
d8b04ac
fix terrno
stephenkgu Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/libs/function/functionMgt.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ typedef enum EFunctionType {
FUNCTION_TYPE_AES_DECRYPT,
FUNCTION_TYPE_SM4_ENCRYPT,
FUNCTION_TYPE_SM4_DECRYPT,
FUNCTION_TYPE_REGEXP_EXTRACT,

// conversion function
FUNCTION_TYPE_CAST = 2000,
Expand Down
1 change: 1 addition & 0 deletions include/libs/scalar/scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ int32_t crc32Function(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOut
int32_t findInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput);
int32_t likeInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput);
int32_t regexpInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput);
int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput);
int32_t generateTotpSecretFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput);
int32_t generateTotpCodeFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput);

Expand Down
84 changes: 82 additions & 2 deletions source/libs/function/src/builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1102,13 +1102,58 @@ static int32_t translateRand(SFunctionNode* pFunc, char* pErrBuf, int32_t len) {
return TSDB_CODE_SUCCESS;
}

// return type is same as first input parameter's type
static int32_t translateOutFirstIn(SFunctionNode* pFunc, char* pErrBuf, int32_t len) {
/*
 * Translate-time validation for regexp_extract(str, pattern [, group_idx]).
 *
 * Checks, in order:
 *   - the generic parameter rules via validateParam();
 *   - that the pattern (param[1]) is a constant VALUE node and, when its literal
 *     text is available, that it compiles as a POSIX extended regular expression;
 *   - that the optional group index (param[2]) is a constant integer in [0, 512].
 *
 * On success the result type is copied from the first parameter (str).
 * Returns TSDB_CODE_SUCCESS, or the code produced by the error-message helpers.
 */
static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32_t len) {
  // Upper bound for the capture-group index; must stay in sync with the text of the error message below.
  const int64_t maxGroupIdx = 512;

  FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len));
  int32_t numOfParams = LIST_LENGTH(pFunc->pParameterList);

  // param[1]: pattern must be a constant VALUE node
  SNode* pPatNode = nodesListGetNode(pFunc->pParameterList, 1);
  if (QUERY_NODE_VALUE != nodeType(pPatNode)) {
    return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: pattern must be a constant");
  }

  // Validate that the regex pattern compiles as POSIX ERE (only possible when the literal text is present)
  SValueNode* pPatVal = (SValueNode*)pPatNode;
  if (pPatVal->literal != NULL) {
    regex_t re;
    int     ret = regcomp(&re, pPatVal->literal, REG_EXTENDED);
    if (ret != 0) {
      char msgbuf[256] = {0};
      (void)regerror(ret, &re, msgbuf, sizeof(msgbuf));
      // Do NOT call regfree() here: after a failed regcomp() the contents of `re` are
      // undefined, so there is nothing that may portably be released.
      return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR,
                             "Invalid regex pattern for regexp_extract: %s", msgbuf);
    }
    regfree(&re);
  }

  // param[2]: group_idx (optional) must be a non-negative integer constant
  if (numOfParams == 3) {
    SNode* pIdxNode = nodesListGetNode(pFunc->pParameterList, 2);
    if (QUERY_NODE_VALUE != nodeType(pIdxNode)) {
      return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be a constant integer");
    }
    SValueNode* pIdxVal = (SValueNode*)pIdxNode;
    if (!IS_INTEGER_TYPE(pIdxVal->node.resType.type)) {
      return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer");
    }
    // Same guard as for the pattern literal above: without literal text the range cannot be
    // checked here. NOTE(review): presumably this happens for bound/placeholder values and the
    // execution-time check in the scalar function covers them -- confirm.
    if (pIdxVal->literal != NULL) {
      int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10);
      if (groupIdx < 0 || groupIdx > maxGroupIdx) {
        return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512");
      }
    }
  }

  // Return type matches str (param[0]): the data type is copied verbatim from the first parameter
  pFunc->node.resType = *getSDataTypeFromNode(nodesListGetNode(pFunc->pParameterList, 0));
  return TSDB_CODE_SUCCESS;
}

// Result type is a verbatim copy of the first input parameter's data type.
// Parameters are validated first; a validation failure is returned to the caller via FUNC_ERR_RET.
static int32_t translateOutFirstIn(SFunctionNode* pFunc, char* pErrBuf, int32_t len) {
  FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len));

  SNode* pFirstParam = nodesListGetNode(pFunc->pParameterList, 0);
  pFunc->node.resType = *getSDataTypeFromNode(pFirstParam);

  return TSDB_CODE_SUCCESS;
}

static int32_t translatePlaceHolderPseudoColumn(SFunctionNode* pFunc, char* pErrBuf, int32_t len) {
// pseudo column do not need to check parameters
switch (pFunc->funcType) {
Expand Down Expand Up @@ -7414,6 +7459,41 @@ const SBuiltinFuncDefinition funcMgtBuiltins[] = {
.sprocessFunc = streamPseudoScalarFunction,
.finalizeFunc = NULL,
},
{
.name = "regexp_extract",
.type = FUNCTION_TYPE_REGEXP_EXTRACT,
.classification = FUNC_MGT_SCALAR_FUNC | FUNC_MGT_STRING_FUNC,
.parameters = {.minParamNum = 2,
.maxParamNum = 3,
.paramInfoPattern = 1,
.inputParaInfo[0][0] = {.isLastParam = false,
.startParam = 1,
.endParam = 1,
.validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE,
.validNodeType = FUNC_PARAM_SUPPORT_EXPR_NODE,
.paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE,
.valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,},
.inputParaInfo[0][1] = {.isLastParam = false,
.startParam = 2,
.endParam = 2,
.validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE,
.validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE,
.paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE,
.valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,},
.inputParaInfo[0][2] = {.isLastParam = true,
.startParam = 3,
.endParam = 3,
.validDataType = FUNC_PARAM_SUPPORT_INTEGER_TYPE,
Comment thread
stephenkgu marked this conversation as resolved.
Outdated
.validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE,
.paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE,
.valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,},
.outputParaInfo = {.validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE}},
.translateFunc = translateRegexpExtract,
.getEnvFunc = NULL,
.initFunc = NULL,
.sprocessFunc = regexpExtractFunction,
.finalizeFunc = NULL,
},
};
// clang-format on

Expand Down
169 changes: 169 additions & 0 deletions source/libs/scalar/src/sclfunc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1817,6 +1817,175 @@ static int32_t base32Encode(const uint8_t *in, int32_t inLen, char *out) {
return outLen;
}

/*
 * Scalar implementation of regexp_extract(str, pattern [, group_idx]).
 *
 * For every input row the string is matched against the constant pattern and the text of
 * capture group `group_idx` (0 = whole match, default 1) is written to the output column.
 * A row yields NULL when the input string is NULL, when there is no match, when the
 * requested group did not participate in the match, or when a character-set conversion
 * for that row fails.
 *
 * pInput[0]  str       VARCHAR/NCHAR (or NULL type)
 * pInput[1]  pattern   constant; only row 0 is read
 * pInput[2]  group_idx optional integer constant; only row 0 is read
 *
 * Returns TSDB_CODE_SUCCESS, TSDB_CODE_FUNC_FUNTION_PARA_VALUE for an out-of-range group
 * index, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR when the pattern cannot be compiled, the
 * value of terrno on allocation failure, or the code returned by colDataSetVal().
 */
int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput) {
  int32_t code = TSDB_CODE_SUCCESS;

  int32_t          numOfRows = pInput[0].numOfRows;
  SColumnInfoData *pStrData = pInput[0].columnData;
  SColumnInfoData *pPatData = pInput[1].columnData;
  SColumnInfoData *pOutputData = pOutput->columnData;

  // Resources released at _exit; initialised so that the cleanup block is valid on every path.
  regmatch_t *pmatch = NULL;
  char       *outBuf = NULL;

  if (numOfRows == 0) {
    pOutput->numOfRows = 0;
    return TSDB_CODE_SUCCESS;
  }

  // NULL-type str or pattern, or a NULL pattern value: all output rows are NULL
  if (IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[0])) || IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[1])) ||
      colDataIsNull_s(pPatData, 0)) {
    colDataSetNNULL(pOutputData, 0, numOfRows);
    pOutput->numOfRows = numOfRows;
    return TSDB_CODE_SUCCESS;
  }

  // Get group_idx (default 1; param[2] is an optional integer constant).
  // It is read as int64_t and range-checked BEFORE narrowing, so a large 64-bit value cannot
  // be truncated into the valid range.
  int64_t groupIdx64 = 1;
  if (inputNum == 3 && !IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[2])) && !colDataIsNull_s(pInput[2].columnData, 0)) {
    GET_TYPED_DATA(groupIdx64, int64_t, GET_PARAM_TYPE(&pInput[2]),
                   colDataGetData(pInput[2].columnData, 0),
                   typeGetTypeModFromColInfo(&pInput[2].columnData->info));
  }
  if (groupIdx64 < 0 || groupIdx64 > 512) {
    return TSDB_CODE_FUNC_FUNTION_PARA_VALUE;
  }
  int32_t groupIdx = (int32_t)groupIdx64;

  // Build null-terminated UTF-8 pattern string (pattern is a constant, always 1 row)
  char    patBuf[512];
  char   *patStr = patBuf;
  int32_t patLen = 0;
  bool    needFreePat = false;
  {
    char   *rawPat = varDataVal(colDataGetData(pPatData, 0));
    int32_t rawPatLen = varDataLen(colDataGetData(pPatData, 0));
    if (GET_PARAM_TYPE(&pInput[1]) == TSDB_DATA_TYPE_NCHAR) {
      // NOTE(review): assumes convNcharToVarchar() allocates and NUL-terminates *patStr -- confirm.
      SCL_ERR_RET(convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt));
      needFreePat = true;
    } else {
      patLen = rawPatLen;
      if (patLen >= (int32_t)sizeof(patBuf)) {
        // Pattern does not fit the stack buffer (including the terminator): use the heap
        patStr = taosMemoryMalloc(patLen + 1);
        if (patStr == NULL) return terrno;
        needFreePat = true;
      }
      (void)memcpy(patStr, rawPat, patLen);
      patStr[patLen] = '\0';
    }
  }

  // Compile (or retrieve cached) regex -- the pattern is constant, so one lookup serves all rows
  regex_t *regex = NULL;
  if (threadGetRegComp(&regex, patStr) != 0) {
    code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR;
    goto _exit;
  }

  // regmatch_t array: index 0 = whole match, 1..groupIdx = capture groups
  int32_t nmatch = groupIdx + 1;
  pmatch = taosMemoryMalloc(nmatch * sizeof(*pmatch));
  if (pmatch == NULL) {
    code = terrno;
    goto _exit;
  }

  // Output buffer: byte width of the str column plus room for the var-data length header.
  // NOTE(review): assumes an extracted value never needs more payload bytes than info.bytes -- confirm.
  int32_t outBufLen = pStrData->info.bytes + VARSTR_HEADER_SIZE;
  outBuf = taosMemoryMalloc(outBufLen);
  if (outBuf == NULL) {
    code = terrno;  // read terrno before any other call can overwrite it
    goto _exit;
  }

  int32_t strType = GET_PARAM_TYPE(&pInput[0]);
  bool    isNchar = (strType == TSDB_DATA_TYPE_NCHAR);

  for (int32_t i = 0; i < numOfRows; i++) {
    if (colDataIsNull_s(pStrData, i)) {
      colDataSetNULL(pOutputData, i);
      continue;
    }

    char   *strRaw = colDataGetData(pStrData, i);
    char   *strVal = varDataVal(strRaw);
    int32_t strLen = varDataLen(strRaw);

    // For NCHAR (UCS-4), convert to UTF-8 before matching
    char   *strUtf8 = strVal;
    int32_t strUtf8Len = strLen;
    bool    needFreeUtf8 = false;
    if (isNchar) {
      if (convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt) != 0) {
        // A row that cannot be converted yields NULL instead of failing the whole call
        colDataSetNULL(pOutputData, i);
        continue;
      }
      needFreeUtf8 = true;
    }

    // regexec() needs a NUL-terminated string; small inputs use the stack buffer
    char  ntBuf[1024];
    char *strNt = ntBuf;
    bool  needFreeNt = false;
    if (strUtf8Len >= (int32_t)sizeof(ntBuf)) {
      strNt = taosMemoryMalloc(strUtf8Len + 1);
      if (strNt == NULL) {
        // Out of memory is an error, not a NULL result: report it to the caller
        code = terrno;
        if (needFreeUtf8) taosMemoryFree(strUtf8);
        break;
      }
      needFreeNt = true;
    }
    (void)memcpy(strNt, strUtf8, strUtf8Len);
    strNt[strUtf8Len] = '\0';

    int ret = regexec(regex, strNt, nmatch, pmatch, 0);
    if (ret != 0 || pmatch[groupIdx].rm_so == -1) {
      // No match (any non-zero regexec result), or the requested capture group did not participate
      colDataSetNULL(pOutputData, i);
    } else {
      int32_t matchStart = pmatch[groupIdx].rm_so;
      int32_t matchLen = pmatch[groupIdx].rm_eo - pmatch[groupIdx].rm_so;

      if (isNchar) {
        // Convert matched UTF-8 bytes back to NCHAR (UCS-4)
        char   *matchedNchar = NULL;
        int32_t matchedNcharLen = 0;
        if (convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen,
                               pInput[0].charsetCxt) != 0) {
          colDataSetNULL(pOutputData, i);
        } else {
          *(VarDataLenT *)outBuf = matchedNcharLen;
          (void)memcpy(outBuf + VARSTR_HEADER_SIZE, matchedNchar, matchedNcharLen);
          taosMemoryFree(matchedNchar);
          // 4th argument is the isNull flag
          code = colDataSetVal(pOutputData, i, outBuf, false);
          if (code != TSDB_CODE_SUCCESS) terrno = code;
        }
      } else {
        *(VarDataLenT *)outBuf = matchLen;
        (void)memcpy(outBuf + VARSTR_HEADER_SIZE, strNt + matchStart, matchLen);
        code = colDataSetVal(pOutputData, i, outBuf, false);
        if (code != TSDB_CODE_SUCCESS) terrno = code;
      }
    }

    if (needFreeNt) taosMemoryFree(strNt);
    if (needFreeUtf8) taosMemoryFree(strUtf8);
    if (code != TSDB_CODE_SUCCESS) break;
  }

_exit:
  // Single cleanup path; each pointer is only released if it was actually allocated
  if (outBuf != NULL) taosMemoryFree(outBuf);
  if (pmatch != NULL) taosMemoryFree(pmatch);
  if (needFreePat) taosMemoryFree(patStr);
  pOutput->numOfRows = numOfRows;
  return code;
}

int32_t generateTotpSecretFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput) {
SColumnInfoData *pInputData = pInput->columnData;
SColumnInfoData *pOutputData = pOutput->columnData;
Expand Down
Loading
Loading