postgresql之page分配管理(一)

news/2024/7/9 20:55:20 标签: postgresql

一、简介

postgresql是多进程架构,同时也是一个庞大的共享内存系统,大部分数据都放在共享内存中,供多个进程协作处理。本文主要介绍buffer(page)的分配管理。

二、page 大小

page默认大小是8kB,可以在编译时进行调整,最大32kB

2.1 如何配置

configure.ac

#
# Block size
#
AC_MSG_CHECKING([for block size])
PGAC_ARG_REQ(with, blocksize, [BLOCKSIZE], [set table block size in kB [8]],
             [blocksize=$withval],
             [blocksize=8])
case ${blocksize} in
  1) BLCKSZ=1024;;
  2) BLCKSZ=2048;;
  4) BLCKSZ=4096;;
  8) BLCKSZ=8192;;
 16) BLCKSZ=16384;;
 32) BLCKSZ=32768;;
  *) AC_MSG_ERROR([Invalid block size. Allowed values are 1,2,4,8,16,32.])
esac
AC_MSG_RESULT([${blocksize}kB])

AC_DEFINE_UNQUOTED([BLCKSZ], ${BLCKSZ}, [
 Size of a disk block --- this also limits the size of a tuple.  You
 can set it bigger if you need bigger tuples (although TOAST should
 reduce the need to have large tuples, since fields can be spread
 across multiple tuples).

 BLCKSZ must be a power of 2.  The maximum possible value of BLCKSZ
 is currently 2^15 (32768).  This is determined by the 15-bit widths
 of the lp_off and lp_len fields in ItemIdData (see
 include/storage/itemid.h).

 Changing BLCKSZ requires an initdb.
])

默认值为8kB,可以通过 --with-blocksize=x 进行设置,x 只能取 1、2、4、8、16、32(单位kB)之一;对应的 BLCKSZ 最大为 32768 字节,即 2 的 15 次方。

2.2 为什么最大只能是32kB

src/include/storage/itemid.h

/*
 * A line pointer on a buffer page.  lp_off and lp_len are 15-bit fields,
 * so the largest representable offset/length is 2^15 - 1, which is what
 * caps BLCKSZ at 32768 bytes (see the configure check above).
 */
typedef struct ItemIdData
{
	unsigned	lp_off:15,		/* offset to tuple (from start of page) */
				lp_flags:2,		/* state of line pointer, see below */
				lp_len:15;		/* byte length of tuple */
} ItemIdData;

ItemIdData 中记录每条元组在 page 内偏移量(lp_off)和长度(lp_len)的字段都只有 15 bit,因此 page 最大只能是 2^15 = 32768 字节。

三、初始化

postgresql在启动时,会进行相应的初始化。

初始化调用栈

src/backend/main/main.c
main()
	src/backend/postmaster/postmaster.c
	PostmasterMain()
		reset_shared();
			src/backend/storage/ipc/ipci.c
			CreateSharedMemoryAndSemaphores()
				src/backend/storage/buffer/buf_init.c
				InitBufferPool();
					src/backend/storage/buffer/freelist.c
					StrategyInitialize(!foundDescs);
						src/backend/storage/buffer/buf_table.c
						InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
/*
 * Compute the size of, create, and initialize the shared memory segment
 * at postmaster startup.  (Excerpt: elided parts are marked "...".)
 */
void
CreateSharedMemoryAndSemaphores(void)
{
	...
// Step 1: add up the shared memory required by each subsystem
		size = 100000;
		...
		size = add_size(size, BufferShmemSize());
		size = add_size(size, LockShmemSize());
		...
		
// Step 2: round the computed size up to a multiple of 8kB
		/* might as well round it off to a multiple of a typical page size */
		size = add_size(size, 8192 - (size % 8192));

	
// Step 3: create the shared memory segment itself
		seghdr = PGSharedMemoryCreate(size, &shim);

// Step 4: initialize each part of the shared memory
		InitShmemAccess(seghdr);

		
	/*
	 * Set up shared memory allocation mechanism
	 */
	if (!IsUnderPostmaster)
		InitShmemAllocation();

	/*
	 * Now initialize LWLocks, which do shared memory allocation and are
	 * needed for InitShmemIndex.
	 */
	CreateLWLocks();

	/*
	 * Set up shmem.c index hashtable
	 */
	InitShmemIndex();

	dsm_shmem_init();

	...
	InitBufferPool();

	/*
	 * Set up lock manager
	 */
	InitLocks();

	/*
	 * Set up predicate lock manager
	 */
	InitPredicateLocks();

	/*
	 * Set up process table
	 */
	if (!IsUnderPostmaster)
		InitProcGlobal();
	CreateSharedProcArray();
	CreateSharedBackendStatus();
	TwoPhaseShmemInit();
	BackgroundWorkerShmemInit();

	/*
	 * Set up shared-inval messaging
	 */
	CreateSharedInvalidationState();

	/*
	 * Set up interprocess signaling mechanisms
	 */
	PMSignalShmemInit();
	ProcSignalShmemInit();
	CheckpointerShmemInit();
	AutoVacuumShmemInit();
	ReplicationSlotsShmemInit();
	ReplicationOriginShmemInit();
	WalSndShmemInit();
	WalRcvShmemInit();
	PgArchShmemInit();
	ApplyLauncherShmemInit();

	/*
	 * Set up other modules that need some shared memory space
	 */
	SnapMgrInit();
	BTreeShmemInit();
	SyncScanShmemInit();
	AsyncShmemInit();
	...
}

这里主要关注InitBufferPool

3.1 创建buffer

src/backend/utils/init/globals.c

int			NBuffers = 1000;

src/backend/storage/buffer/buf_init.c

/*
 * Allocate the three shared-memory arrays that make up the buffer pool:
 * one descriptor, one BLCKSZ-sized block, and one I/O condition variable
 * per buffer (NBuffers of each).  (Excerpt: elided parts marked "...".)
 */
void
InitBufferPool(void)
{
	...

	/* Align descriptors to a cacheline boundary. */
	BufferDescriptors = (BufferDescPadded *)
		ShmemInitStruct("Buffer Descriptors",
						NBuffers * sizeof(BufferDescPadded),
						&foundDescs);

	/* The data pages themselves: NBuffers blocks of BLCKSZ bytes */
	BufferBlocks = (char *)
		ShmemInitStruct("Buffer Blocks",
						NBuffers * (Size) BLCKSZ, &foundBufs);

	/* Align condition variables to cacheline boundary. */
	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
		ShmemInitStruct("Buffer IO Condition Variables",
						NBuffers * sizeof(ConditionVariableMinimallyPadded),
						&foundIOCV);
}

3.2 初始化buffer header

		/*
		 * Initialize all the buffer headers.
		 */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *buf = GetBufferDescriptor(i);

			/* no disk page is associated with this buffer yet */
			CLEAR_BUFFERTAG(buf->tag);

			pg_atomic_init_u32(&buf->state, 0);
			buf->wait_backend_pid = 0;

			buf->buf_id = i;

			/*
			 * Initially link all the buffers together as unused. Subsequent
			 * management of this list is done by freelist.c.
			 */
			buf->freeNext = i + 1;

			...
		}

		/* Correct last entry of linked list */
		GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
	}

四、buffer组织管理

buffer已经有了,但是要如何快速的找到一个buffer?哪些buffer没有使用?哪些已经使用? 这里使用了一个hash表以及一个freelist。

4.1 创建hashtable

/*
 * Initialize the shared BufferTag -> buffer id lookup hash table.
 * size is the maximum number of entries (NBuffers + NUM_BUFFER_PARTITIONS
 * at the call site above).
 */
void
InitBufTable(int size)
{
	HASHCTL		info;

	/* assume no locking is needed yet */

	/* BufferTag maps to Buffer */
	info.keysize = sizeof(BufferTag);
	info.entrysize = sizeof(BufferLookupEnt);
	info.num_partitions = NUM_BUFFER_PARTITIONS;

	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
								  size, size,
								  &info,
								  HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
}
/* Create/attach a hash table in shared memory.  (Truncated excerpt.) */
HTAB *
ShmemInitHash(const char *name,		/* table string name for shmem index */
			  long init_size,	/* initial table size */
			  long max_size,	/* max size of the table */
			  HASHCTL *infoP,	/* info about key and bucket size */
			  int hash_flags)	/* info about infoP */
{
	...
	/* look it up in the shmem index */
	location = ShmemInitStruct(name,
							   hash_get_shared_size(infoP, hash_flags),
							   &found);
  ...
/* Compute the shared-memory footprint of a hash table: header plus
 * the (fixed-size) directory of segment pointers. */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
	Assert(flags & HASH_DIRSIZE);
	Assert(info->dsize == info->max_dsize);
	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}

4.2 初始化hashtable

src/backend/utils/hash/dynahash.c

/*
 * Set up a dynahash table's bucket/segment structure: choose the bucket
 * count, allocate the initial directory segments, and pick the per-batch
 * element allocation size.  (Excerpt: elided part marked "....".)
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);
....

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

	return true;
}
/*
 * Allocate and zero one directory segment: an array of ssize bucket
 * headers.  Returns NULL if the table's allocator fails.
 */
static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
	HASHSEGMENT segp;

	CurrentDynaHashCxt = hashp->hcxt;
	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

	if (!segp)
		return NULL;

	/* all buckets start out empty */
	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

	return segp;
}

4.3 创建freelist

src/backend/utils/hash/dynahash.c

...
/* Pre-populate each freelist partition with a batch of hash elements;
 * the first partition gets nelem_alloc_first, the rest get nelem_alloc. */
for (i= 0; i < freelist_partitions; i++)
{
	int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

	if (!element_alloc(hashp, temp, i))
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
}
...
/*
 * Allocate a batch of nelem hash elements in one chunk, chain them into a
 * singly linked list, and push the list onto the given freelist partition.
 * Returns false on allocation failure or if the table is fixed-size.
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	if (hashp->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	CurrentDynaHashCxt = hashp->hcxt;
	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

	if (!firstElement)
		return false;

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}

五、结构图

在这里插入图片描述

六、分配

src/backend/storage/buffer/bufmgr.c

/*
 * Look up or allocate a buffer for the given relation fork/block.
 * *foundPtr reports whether the page was already in the pool.
 * (Signature only; the body is elided in this article.)
 */
static BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
			BlockNumber blockNum,
			BufferAccessStrategy strategy,
			bool *foundPtr){
...
}

6.1 根据BufferTag计算hash值

	/* create a tag so we can lookup the buffer */
	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
	
	newHash = BufTableHashCode(&newTag);

6.2 hash表中查找

	buf_id = BufTableLookup(&newTag, newHash);

6.3 没有找到,分配新的一个

/* Caller loop: keep asking the strategy for a victim buffer. (Excerpt.) */
for (;;)
	{
	...
		buf = StrategyGetBuffer(strategy, &buf_state);
	...
}
/*
 * Hand out a free (or, in the elided code, evictable) buffer.
 * This excerpt keeps only the freelist path: pop the descriptor at
 * firstFreeBuffer and unlink it via its freeNext pointer.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	......
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
	...
	if (StrategyControl->firstFreeBuffer >= 0)
	{
		while (true)
		{
			...
			/* recheck under lock: list may have been emptied concurrently */
			if (StrategyControl->firstFreeBuffer < 0)
			{
				...
				break;
			}

			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);

			/* Unconditionally remove buffer from freelist */
			StrategyControl->firstFreeBuffer = buf->freeNext;
			buf->freeNext = FREENEXT_NOT_IN_LIST;

			...
				return buf;
			...
		}
	}
	....
}

以上代码删去了各种判断逻辑,只保留整体结构,可以看出是通过 StrategyControl->firstFreeBuffer 获取第一个空闲的 buffer。

然后从freelist中获取一个节点将这个id和buffer进行关联,将节点从freelist中移除(指针改变),然后将此节点加入hash中。

在这里插入图片描述

七、后续

  1. 随着page的分配,当分配完后,是扩容还是淘汰? 如何淘汰?
  2. freelist中的节点分配完了,如何操作?

http://www.niftyadmin.cn/n/1829967.html

相关文章

横竖屏切换时不销毁当前activity 和 锁定屏幕

首先在AndroidManifest.xml的Activity元素中加入android:configChanges="orientation|keyboardHidden"属性 <activity android:name=".FileBrowser" android:label="@string/app_name" android:configChanges="orientation|keyboardHidden"> &…

Flag 1

水了大学三年&#xff0c;发现周围的人已经找到所爱之物&#xff0c;我却还在迷茫。 从选专业开始&#xff0c;就一味的随波逐流&#xff0c;身在这个时代的浪潮之中&#xff0c;难免会追名逐利。 但是人不能永远做自己讨厌之事&#xff0c;或许你喜欢的不是主流的&#xff0c;…

可翻折的TextViewExpandableTextView

转载请注明&#xff1a;http://blog.csdn.net/ddwhan0123&#xff0c;谢谢 今天上的是一个在项目中运用到的一个开源框架ExpandableTextView。 效果图如下&#xff1a; 点击前的效果: 点击后的效果: 样式不复杂就是一个会折叠的TextView&#xff0c;话不多说&#xff0c;上…

移动开发笔记

2015年2月9日 18:09:11 因为移动端app有新老版本的区别 服务端推送给app的数据, 老版本可能解析不了 因此在移动端设计的时候要定义个"白名单", 服务端来的数据, 只解析在白名单内的数据 防止因为有新数据返回时, 老版本解析不了而出错, 或者, 服务端为了兼容做太多的…

postgresql之page分配管理(二)

一、淘汰page 1.1 使用时钟轮转算法获取淘汰buffer static inline uint32 ClockSweepTick(void) {uint32 victim;/** Atomically move hand ahead one buffer - if theres several processes* doing this, this can lead to buffers being returned slightly out of* appare…

从excel文件中获取数据(2)

本方法引用 Aspose.Cells.dll&#xff0c;ICSharpCode.SharpZipLib.dll &#xff0c;NPOI.dll&#xff0c;NPOI.OOXML.dll&#xff0c;NPOI.OpenXml4Net.dll&#xff0c;NPOI.OpenXmlFormats.dll static void Main(string[] args) { string filepath "…

2016-11-22(1)(C#相关)---数组

---------------------------------------------------------------------------------------数组--------------------------------------------------------------------------------------- 一维数组 多维数组 数组的数组 一维数组 性能最好 System.Array 是所有数组的基类…

在hadoop作业中自定义分区和归约

版权声明&#xff1a;本文为博主原创文章&#xff0c;未经博主允许不得转载。 https://blog.csdn.net/qq1010885678/article/details/43735703 当遇到有特殊的业务需求时&#xff0c;需要对hadoop的作业进行分区处理 那么我们可以通过自定义的分区类来实现 还是通过单词计数的例…