可优化语句执行

概述
CopyOneRowTo函数
- ScalarVector类
- CopySendString 函数
- FixedRowOut 函数
- CopySendInt32 函数
- CopySendData 函数
- appendBinaryStringInfo 函数

声明：本文的部分内容参考了他人的文章。在编写过程中，我们尊重他人的知识产权和学术成果，力求遵循合理使用原则，并在适用的情况下注明引用来源。
本文主要参考了 OpenGauss1.1.0 的开源代码

概述

本文主要围绕列存储进行学习。

CopyOneRowTo函数

CopyOneRowTo 函数的作用是在执行 COPY TO 时输出一行数据（row）。它接收该行各列的值（values，Datum 数组）以及对应的空值标记（nulls），根据 cstate 中指定的文件格式（二进制、文本、CSV 或固定列宽）将每一列转换为相应的输出形式并追加到输出缓冲区，最后调用 cstate->writelineFunc 将这一行写出。需要注意的是，该函数本身并不直接操作 ScalarVector，它的输入是按行组织的 Datum 数组；ScalarVector 是列存批次数据的存储结构，将在下一节介绍。
CopyOneRowTo函数源码如下：（src/gausskernel/optimizer/commands/copy.cpp）

/*
 * CopyOneRowTo
 *
 * Emit one row during CopyTo().
 *
 * cstate   - COPY state; supplies the file format, the column list, the
 *            per-column output functions, the per-row memory context and
 *            the line writer callback.
 * tupleOid - OID of the row; only emitted when cstate->oids is set.
 * values   - column values, indexed by (attribute number - 1).
 * nulls    - per-column null flags, indexed the same way as values.
 */
static void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum* values, const bool* nulls)
{
    FmgrInfo* outFuncs = cstate->out_functions;
    MemoryContext callerContext;
    ListCell* cell = NULL;
    char* text = NULL;
    /* Becomes true once something was written that the next text/CSV column must be separated from. */
    bool delimPending = false;

    /* Reset the per-row context and make it current for the rest of this call. */
    MemoryContextReset(cstate->rowcontext);
    callerContext = MemoryContextSwitchTo(cstate->rowcontext);

    if (IS_BINARY(cstate)) {
        /* Binary row header: the number of columns in the column list. */
        CopySendInt16(cstate, list_length(cstate->attnumlist));
        if (cstate->oids) {
            /* The OID is sent as a length word plus the value; Oid is assumed to be the same size as int32. */
            CopySendInt32(cstate, sizeof(int32));
            CopySendInt32(cstate, tupleOid);
        }
    } else if (cstate->oids) {
        /* Non-binary formats: send the OID as text, assuming digits need no quoting or encoding conversion. */
        text = DatumGetCString(DirectFunctionCall1(oidout, ObjectIdGetDatum(tupleOid)));
        CopySendString(cstate, text);
        delimPending = true;
    }

    if (IS_FIXED(cstate)) {
        /* Fixed-width output is handled as a whole row by FixedRowOut. */
        FixedRowOut(cstate, values, nulls);
    } else {
        foreach (cell, cstate->attnumlist) {
            int attnum = lfirst_int(cell);
            int col = attnum - 1; /* attribute numbers are 1-based, the arrays are 0-based */
            Datum value = values[col];
            bool isnull = nulls[col];
            bool textLike = (cstate->fileformat == FORMAT_CSV || cstate->fileformat == FORMAT_TEXT);

            /* CSV/text: put a delimiter in front of every item except the first one written. */
            if (textLike) {
                if (delimPending) {
                    CopySendString(cstate, cstate->delim);
                }
                delimPending = true;
            }

            if (isnull) {
                if (textLike) {
                    /* NULL is represented by the configured null marker string. */
                    CopySendString(cstate, cstate->null_print_client);
                } else if (cstate->fileformat == FORMAT_BINARY) {
                    /* NULL is represented by a length word of -1 with no data after it. */
                    CopySendInt32(cstate, -1);
                } else {
                    ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Invalid file format")));
                }
                continue;
            }

            if (IS_BINARY(cstate)) {
                /* Binary: run the column's send function, then emit payload length followed by payload bytes. */
                bytea* payload = SendFunctionCall(&outFuncs[col], value);
                CopySendInt32(cstate, VARSIZE(payload) - VARHDRSZ);
                CopySendData(cstate, VARDATA(payload), VARSIZE(payload) - VARHDRSZ);
                continue;
            }

            /* Non-binary: convert the value to its string form, then let the format-specific writer emit it. */
            text = OutputFunctionCall(&outFuncs[col], value);
            if (cstate->fileformat == FORMAT_CSV) {
                /* The last argument tells the CSV writer whether this is the only column being output. */
                CopyAttributeOutCSV(
                    cstate, text, cstate->force_quote_flags[col], list_length(cstate->attnumlist) == 1);
            } else if (cstate->fileformat == FORMAT_TEXT) {
                CopyAttributeOutText(cstate, text);
            } else {
                ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Invalid file format")));
            }
        }
    }

    /* Hand the finished row to the line writer, then restore the caller's memory context. */
    cstate->writelineFunc(cstate);
    (void)MemoryContextSwitchTo(callerContext);
}

ScalarVector类

ScalarVector 类是一种数据结构，用于存储单一数据列的向量化数据。在数据库系统中，数据通常以表格的形式存储，每个列都包含一组数据。ScalarVector 类的作用是为了优化这些列数据的处理，提高数据访问和计算的效率。
ScalarVector 类的源码如下：（路径：src/include/vecexecutor/vectorbatch.h）

// The core data structure for a column.
//
// Holds m_rows values in m_vals with a parallel per-row flag array m_flag
// (the V_NULL_MASK bits of a flag mark the row as NULL, see IsNull/SetNull).
// Data that is not stored inline in a ScalarValue is kept in the companion
// buffer m_buf (see the AddVar* family below).
class ScalarVector : public BaseObject {
    friend class VectorBatch;

public:
    // number of values.
    int m_rows;

    // type description information for this scalar value.
    ScalarDesc m_desc;

    // this value means that the value in the scalarvector is always the same
    bool m_const;

    // flags in the scalar value array (one entry per row; carries the null bit).
    uint8* m_flag;

    // a companion buffer to store the data if the data type is not plain.
    VarBuf* m_buf;

    // the value array.
    ScalarValue* m_vals;

public:
    // decode a variable length data.
    // null value judgement should be outside of this function.
    // As written here this simply returns val unchanged.
    FORCE_INLINE
    static Datum Decode(ScalarValue val)
    {
        return val;
    }

    // convert a datum to scalar value
    static ScalarValue DatumToScalar(Datum datumVal, Oid datumType, bool isNull);

    // same conversion with the datum type supplied as a template argument
    template <Oid datumType>
    static ScalarValue DatumToScalarT(Datum datumVal, bool isNull);

public:
    // constructor/destructor.
    ScalarVector();
    ~ScalarVector();

    // init the ScalarVector.
    //
    void init(MemoryContext cxt, ScalarDesc desc);
    
    // used in tsdb. init with another ScalarVector object.
    //
    void init(MemoryContext cxt, ScalarVector* vec, const int batchSize);

    // serialize the Scalar vector
    //
    void Serialize(StringInfo buf);

    // serialize the Scalar vector of the particular index
    //
    void Serialize(StringInfo buf, int idx);

    // Deserialize the vector
    //
    char* Deserialize(char* msg, size_t len);

    // Add a variable length data
    // this var may be from
    // cstring, fixed length(> 8) data type, or pg traditional header-contain variable length
    Datum AddVar(Datum data, int index);

    // Add a header-contain variable
    Datum AddVarWithHeader(Datum data);

    // Add a variable without header on a special position. The original variable will be
    // transferred in together with the length of the content. And inside the function, the header
    // of the ScalarValue will be added before the actual content according to the data type.
    Datum AddBPCharWithoutHeader(const char* data, int maxLen, int len, int aindex);
    Datum AddVarCharWithoutHeader(const char* data, int len, int aindex);

    // Add a short decimal without header on a special position. The value of decimal
    // will be transferred in by int64 format together with the scale of it. And inside the function,
    // the header will be added and the value will be converted into PG format. Here we only support
    // short decimal which can be stored using int64.
    Datum AddShortNumericWithoutHeader(int64 value, uint8 scale, int aindex);
    // NOTE(review): counterpart taking the value as int128; presumably follows the same
    // scheme as the int64 variant above -- confirm against the implementation.
    Datum AddBigNumericWithoutHeader(int128 value, uint8 scale, int aindex);

    // NOTE(review): takes a raw byte range (src, length) and returns a char*; presumably the
    // location of the stored copy -- confirm against the implementation.
    char* AddVars(const char* src, int length);

    // add a normal header-contain val
    Datum AddHeaderVar(Datum data, int index);

    // add a cstring type val
    Datum AddCStringVar(Datum data, int index);

    // add a fixed length val
    template <Size len>
    Datum AddFixLenVar(Datum data, int index);

    // copy a vector
    void copy(ScalarVector* vector, int start_idx, int endIdx);
    void copy(ScalarVector* vector);

    void copyDeep(ScalarVector* vector, int start_idx, int endIdx);
    void copyNth(ScalarVector* vector, int Nth);

    // copy variant driven by a bool array (pSel); presumably a per-row selection mask -- TODO confirm
    void copy(ScalarVector* vector, const bool* pSel);

    // convert a cstring to Scalar value.
    static Datum DatumCstringToScalar(Datum data, Size len);

    // convert a fixed len datatype to Scalar Value
    static Datum DatumFixLenToScalar(Datum data, Size len);

    // True when all V_NULL_MASK bits are set in the flag of row i.
    // The index is asserted to lie within the current row count.
    FORCE_INLINE
    bool IsNull(int i)
    {
        Assert(i >= 0 && i < m_rows);
        return ((m_flag[i] & V_NULL_MASK) == V_NULL_MASK);
    }

    // Set the V_NULL_MASK bits in the flag of row i.
    // Note the bound asserted here is BatchMaxSize, not m_rows as in IsNull.
    FORCE_INLINE
    void SetNull(int i)
    {
        Assert(i >= 0 && i < BatchMaxSize);
        m_flag[i] |= V_NULL_MASK;
    }

    // Mark every row in [0, m_rows) as NULL.
    FORCE_INLINE
    void SetAllNull()
    {
        for (int i = 0; i < m_rows; i++) {
            SetNull(i);
        }
    }

private:
    // init some function pointer.
    void BindingFp();

    // pointer to a member function with the AddVar signature;
    // presumably the one set up by BindingFp() -- confirm in the implementation.
    Datum (ScalarVector::*m_addVar)(Datum data, int index);
};

CopySendString 函数

CopySendString 函数的目的是将以 null 结尾的字符串追加到 CopyState 结构中的前端消息缓冲区（fe_msgbuf）中，以便之后将这些数据发送给客户端。它先用 strlen 计算字符串长度，再调用 appendBinaryStringInfo 函数将字符串数据追加到消息缓冲区中。CopySendString 函数源码如下：（src/gausskernel/optimizer/commands/copy.cpp）

/*
 * CopySendString
 *
 * Append a null-terminated string to cstate->fe_msgbuf.
 * The terminating null of str is not counted in the appended length.
 */
void CopySendString(CopyState cstate, const char* str)
{
    size_t nbytes = strlen(str);

    appendBinaryStringInfo(cstate->fe_msgbuf, str, nbytes);
}

appendBinaryStringInfo 函数用于向StringInfo结构中追加任意二进制数据。首先，它会检查 StringInfo 结构是否为空。然后，根据需要分配更多空间以容纳要追加的数据。接下来，使用 memcpy_s 函数将数据追加到 StringInfo 结构的末尾，并更新长度信息。最后，会在字符串的末尾添加一个 null 字符，即使对于二进制数据来说，这个 null 字符可能没有实际用处。appendBinaryStringInfo 函数源码如下：（src/common/backend/lib/stringinfo.cpp）

/*
 * appendBinaryStringInfo
 *
 * Append arbitrary binary data to a StringInfo, allocating more space
 * if necessary.
 *
 * str     - destination buffer; must not be NULL.
 * data    - bytes to append.
 * datalen - number of bytes to append.
 */
void appendBinaryStringInfo(StringInfo str, const char* data, int datalen)
{
    Assert(str != NULL);

    /* Make more room if needed. */
    enlargeStringInfo(str, datalen);

    /* Copy the new bytes right after the current contents, bounded by the space left in the buffer. */
    char* tail = str->data + str->len;
    size_t room = (size_t)(str->maxlen - str->len);
    errno_t rc = memcpy_s(tail, room, data, (size_t)datalen);
    securec_check(rc, "\0", "\0");

    str->len += datalen;

    /*
     * Keep a trailing null in place, even though it's probably useless for
     * binary data.  (Some callers are dealing with text but call this because
     * their input isn't null-terminated.)
     */
    str->data[str->len] = '\0';
}

DatumGetCString 是一个宏，作用是将 Datum 类型的数据转换为C字符串。源码如下：（路径：src/include/postgres.h）

/*
 * DatumGetCString
 *		Returns C string (null-terminated string) value of a datum.
 *
 * The macro expands to DatumGetPointer(X) with the result cast to char*;
 * the macro itself adds nothing beyond that cast.
 *
 * Note: C string is not a full-fledged Postgres type at present,
 * but type input functions use this conversion for their inputs.
 */

#define DatumGetCString(X) ((char*)DatumGetPointer(X))

FixedRowOut 函数

FixedRowOut 函数是在固定列宽格式下将一行数据输出。它首先根据格式信息扩展输出缓冲区，然后遍历每个字段进行处理，根据字段的值和是否为null，调用相应的处理函数输出数据或null值。源码如下：（路径：src/gausskernel/optimizer/commands/formatter.cpp）

/*
 * FixedRowOut
 *
 * Output one row in fixed-width format.
 *
 * cstate - COPY state; cstate->formatter is used as a FixFormatter and
 *          cstate->out_functions supplies the per-column output functions.
 * values - column values, indexed by (attribute number - 1).
 * nulls  - per-column null flags, indexed the same way as values.
 */
void FixedRowOut(CopyState cstate, Datum* values, const bool* nulls)
{
    FixFormatter* formatter = (FixFormatter*)cstate->formatter;
    FieldDesc* descs = formatter->fieldDesc;
    FmgrInfo* outFuncs = cstate->out_functions;
    char* text = NULL;

    /* Enlarge the output buffer by the formatter's line size before writing any field. */
    enlargeStringInfo(cstate->fe_msgbuf, formatter->lineSize);

    for (int fieldIdx = 0; fieldIdx < formatter->nfield; fieldIdx++) {
        /* Attribute numbers are 1-based; values/nulls/out_functions are 0-based. */
        int attnum = formatter->fieldDesc[fieldIdx].attnum;
        FieldDesc* desc = descs + fieldIdx;
        Datum value = values[attnum - 1];

        if (!nulls[attnum - 1]) {
            /* Non-null: convert the value to its string form and write it into the field. */
            text = OutputFunctionCall(&outFuncs[attnum - 1], value);
            Assert(text != NULL);
            AttributeOutFixed<false>(cstate, text, desc);
        } else {
            /* Null: write the field's own null string instead. */
            AttributeOutFixed<false>(cstate, descs[fieldIdx].nullString, desc);
        }
    }
}

CopySendInt32 函数

CopySendInt32 函数用于将一个 int32 类型的值以网络字节序发送出去。它首先将传入的 int32 值转换为网络字节序，并将结果存储在 buf 中，然后通过调用 CopySendData 函数将 buf 中的数据发送出去。函数源码如下：（路径：src/gausskernel/optimizer/commands/copy.cpp）

/*
 * These functions do apply some data conversion
 */

/*
 * CopySendInt32
 *
 * Send an int32 in network byte order: the value is passed through htonl()
 * and the resulting 4 bytes are handed to CopySendData.
 */
static void CopySendInt32(CopyState cstate, int32 val)
{
    uint32 netOrder = htonl((uint32)val);

    CopySendData(cstate, &netOrder, sizeof(netOrder));
}

CopySendData 函数

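下面代码开头的注释描述了一组发送数据的函数（CopySendData、CopySendString、CopySendChar 和 CopySendEndOfRow），分别用于发送二进制数据、以 null 结尾的字符串、单个字符以及在每行数据末尾执行适当的操作；这里只给出其中 CopySendData 的实现。CopySendData 会将指定长度的数据追加到 cstate->fe_msgbuf 中，其中 cstate 是 CopyState 类型，表示数据拷贝的状态。这些函数并不会对数据进行任何转换，只是简单地将数据追加到消息缓冲区中。函数源码如下：（路径：src/gausskernel/optimizer/commands/copy.cpp）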

/* ----------
 * CopySendData sends output data to the destination (file or frontend)
 * CopySendString does the same for null-terminated strings
 * CopySendChar does the same for single characters
 * CopySendEndOfRow does the appropriate thing at end of each data row
 *  (data is not actually flushed except by CopySendEndOfRow)
 *
 * NB: no data conversion is applied by these functions
 * ----------
 */
static void CopySendData(CopyState cstate, const void* databuf, int datasize)
{
    /* The bytes are only appended to cstate->fe_msgbuf here. */
    const char* bytes = (const char*)databuf;

    appendBinaryStringInfo(cstate->fe_msgbuf, bytes, datasize);
}

appendBinaryStringInfo 函数

appendBinaryStringInfo 函数接受一个 StringInfo 结构体指针 str，一个 const char* 类型的数据指针 data，以及一个整数 datalen，表示数据的长度。函数会首先确保 str 不为空，然后根据需要分配更多空间，将指定长度的数据复制到 str 的数据缓冲区中，然后更新已追加数据的长度，并在数据末尾添加一个 null 字符，以保证字符串的正确终止。这个函数通常用于将二进制数据添加到 StringInfo 结构体中，StringInfo 是一个动态字符串结构体，它的大小可以根据需要自动增长。源码如下：（路径：src/common/backend/lib/stringinfo.cpp）

/*
 * appendBinaryStringInfo
 *
 * Append arbitrary binary data to a StringInfo, allocating more space
 * if necessary.
 *
 * str     - destination buffer; must not be NULL.
 * data    - bytes to append.
 * datalen - number of bytes to append.
 */
void appendBinaryStringInfo(StringInfo str, const char* data, int datalen)
{
    Assert(str != NULL);

    /* Make more room if needed. */
    enlargeStringInfo(str, datalen);

    /* Copy the new bytes right after the current contents, bounded by the space left in the buffer. */
    char* tail = str->data + str->len;
    size_t room = (size_t)(str->maxlen - str->len);
    errno_t rc = memcpy_s(tail, room, data, (size_t)datalen);
    securec_check(rc, "\0", "\0");

    str->len += datalen;

    /*
     * Keep a trailing null in place, even though it's probably useless for
     * binary data.  (Some callers are dealing with text but call this because
     * their input isn't null-terminated.)
     */
    str->data[str->len] = '\0';
}