Python list实现

前言

本文所讲Python实现均为CPython，需读者具备一定的C语言阅读能力。本博文参考了**《Python源码剖析》**与Python2.7源码。PyListObject采用顺序存储（而非链式存储），熟悉数据结构的读者，能很容易明白本博文所讲内容。

介绍

PyListObject是Python提供的List容器实现，与C++ STL中的vector实现机制相近。PyListObject是变长对象同时也是可变对象（很显然，不同时刻List中可以存在不同数目的元素）。

PyListObject定义如下：

typedef struct {
	PyObject_VAR_HEAD
	PyObject **ob_item;
	int allocated;
} PyListObject;

PyObject_VAR_HEAD中的ob_size与PyListObject中的allocated字段分别标识了容器的现有*元素个数（size）**与容器容量（capacity）***。ob_item为指向PyObject *的指针（即PyObject *数组），是PyListObject实现顺序存储的数组。

实现

1、创建对象

Python提供了唯一创建List的函数——PyList_New。下面是简化的后Python创建PyListObject对象的过程。

#define MAXFREELISTS 80
static PyListObject *free_lists[MAXFREELISTS];
static int num_free_ists = 0;

PyObject *PyList_New(int size) {
	PyListObject *op;
	size_t nbytes;
	// 判断int类型是否溢出，若溢出则返回内存分配失败
	nbytes = size * sizeof(PyObject *);
	if(nbytes / sizeof(PyObject *) != (size_t)size) {
		return PyErr_NoMemory();
	}
	//
	if(num_free_lists) {
		// 缓冲池可用，则从缓冲池取一可用List
		num_free_lists--;
		op = free_lists[num_free_lists];
		_Py_NewReference((PyObject *)op);
	} else {
		// 缓冲池不可用，直接新建对象并为Python中的自动垃圾收集机制做一些工作
		op = PyObject_GC_New(PyListObject, &PyList_Type);
	}
	
	if(size <= 0) {
		op->ob_item = NULL;
	} else {
		op->ob_item = (PyObject **)PyMem_MALLOC(nbytes);
		memset(op->ob_item, 0, nbytes);
	}
	op->ob_size = size;
	op->allocated = size;
	return (PyObject *)op;
}

PyListObject对象分为两部分：①PyListObject对象②PyListObject对象容纳的PyObject元素。

2、设置元素

前面提到PyListObject是顺序存储，可以**随机访问**。通过下标设置List中元素值，是由PyList_SetItem函数实现的。

int PyList_SetItem(register PyObject *op, register int i, register PyObject *new_item) {
 	// 保存指向旧元素的指针，用于减少引用计数
	register PyObject *olditem;
	register PyObject **p;
	// 检查索引值得合法性
	if(i < 0 || i>= (PyListObject)op->ob_size) {
		// 报索引错误
		return -1;
	}
	// 设置元素
	p = ((PyListObject*)op)->ob_item + i;
	olditem = *p;
	Py_XDECREF(olditem);
	return 0;
}

3、插入元素

了解顺序存储的读者，很容易想到新元素的插入会导致元素的移动。PyListObject的实现也不例外，而这其中又牵扯了PyListObject.ob_item的*扩容**与缩容***（参考Redis或者其它若干软件的实现，都会有类似机制）。

int PyList_Insert(PyObject *op, Py_ssize_t where, PyObject *new_item) {
	return insl((PyListObject *)op, where, newitem);
}

static int insl(PyListObject *self, Py_ssize_t where, PyObject *v) {
	Py_ssize_t i, n = self->ob_size;
	PyObject **items;
	// 调整列表容量
	if(list_resize(self, n+1) == -1)
		return -1;
	// 确定插入点
	if(where < 0) {
		// 负数索引
		where += n;
		if(where < 0)
			where = 0;
	}
	if(where > n)
		where = n;
	// 插入元素
	items = self->ob_item;
	for(i = n; --i >= where; )
		// 从后往前将元素后移一个单位，空出新元素存储单元
		item[i+1] = item[i]
	// 使用宏Py_INCREF增加元素v的引用计数
	Py_INCREF(v);
	item[where] = v;
	return 0;
}

其中函数list_resize为PyListObject对象*扩容**与缩容***的关键。list_resize函数的实现如下：

static int list_resize(PyObjectList *self, int newsize) {
	PyObject **items;
	size_t new_allocated;
	int allocated = self->allocated;
	// 不需要申请内存
	if(allocated >= newsize && newsize >= (allocated >> 1)) {
		self->ob_size = newsize;
		return 0;
	}
	// 计算重新申请内存的大小
	new_allocated = (newsize >> 3) + (newsize < 9 ? 3 : 6) + newsize;
	if(newsize == 0)
		new_allocated = 0;
	// 扩展列表
	items = self->ob_items;
	// 最终调用c语言的realloc
	PyMem_RESIZE(item, PyObject *, new_allocated);
	self->ob_itme = items;
	self->ob_size = newsize;
	self->allocated = new_allocated;
	return 0;
}

当List新的元素个数newsize，满足条件：allocated/2 <= newsize <= allocated时，不需要进行realloc。当newsize >= allocated时，PyObjectList会进行*扩容**操作，当newsize < allocated/2时PyObjectList会进行缩容***操作。

对象池

CPython为了解决频繁创建对象带来的性能问题（大多数对性能要求较高的C程序均采用类似机制），采用了大量的对象池技术——PyListObject的实现也不例外。如果读者对此类技术不熟悉，请参阅**对象池**设计模式。

在如上PyList_New函数的实现代码中，free_lists指针数组便是用于PyListObject创建的对象池。我们可以看到如果存在可用的PyListObject，Python便会从对象池中取出并返回一个PyListObject对象。那么PyListObject对象是**何时、如何**归还给对象池的呢？答案就在销毁PyListObject的list_dealloc函数里。

static void list_dealloc(PyListObject *op) {
	int i;
	if(op->ob_item != NULL) {
		i = op->ob_size;
		while(--i >= 0) {
			Py_XDECREF(op->ob_item[i]);
		}
		PyMem_FREE(op->ob_item);
	}
	// 释放PyListObject自身
	if(num_free_lists < MAXFREELISTS && PyList_CheckExact(op))
		free_lists[num_free_lists++] = op;
	else
		op->ob_type->tp_free((PyObject *)op);
}