sheepdog 源码学习（1）

最近比较忙，做毕设（sheepdog），要在 sheepdog 的基础上做点东西，也就是说要改 sheepdog 源码。我只有一个月时间，所以最近一直都在读 sheepdog 源码。C 语言太菜，里面好多用法都不会，基本都是边看边查。。。

想知道 sheepdog 可以干什么，首先，你需要启动 sheep，启动的命令可以用下面这个。当然也可以用 test 目录下的测试脚本来启动一个虚拟的 sheepdog 集群。何谓虚拟呢？就是这个脚本可以在你的主机上启动 5 个 sheep 进程，每个 sheep 进程占用不同的端口，这样这 5 个 sheep 进程就组成了一个 sheepdog 集群了。这样，你也就拥有一个 sheepdog 集群了。之后，你就可以用 dog 命令来看一些东西了，比如：dog node list、dog vdi list 之类的，具体是什么意思就需要你去 sheepdog 的 github 上读点文档啦。。。和系统相关的先说这些吧，这只是个开始，但并不是我的重点。再啰嗦一点，这样默认启动后，sheepdog 的日志目录是在 /tmp/sheepdog/ 目录下。这个目录下面有这样几个子目录。首先可能是 0/ 1/ 2/ 3/ ... 这些以数字命名的目录，数字就是 node ID，每个目录就是对应 sheep 的 log 目录。因为你是在主机上虚拟出的几个 sheep，每一个 sheep 都要有一个 log 目录，这是自然的啦。。。进入任意一个目录，就拿 0/ 作比方吧，进去可以看到 obj/ sheep.log epoch sock config 等文件。其中 obj 就是用来存放数据块的目录哦，如果你给你创建的 vdi 写入了比较多的东西，那么这个目录下应该有很多文件，并且每一个都是 4MB。哈哈，不信你可以试一下。当然，要读系统 debug 日志的话，打开 sheep.log 读就可以了。

sheep /tmp/sheepdog/4 -z 4 -p 7004 -c local -n -y 127.0.0.1 -d
# 当然，这是我从 test 目录下的脚本里偷出来的哈。。。

dog 是 sheepdog 中很重要的一个组成部分，它提供了一系列的系统命令。要分析整个系统的功能，我们当然可以从这里入手，顺藤摸瓜。。。好吧。开始吧。首先我们找到这个文件，会是谁呢，当然是 dog.c 啦，还能有谁啊，打开它啊，从哪里读呢，当然是从 main 开始啊。开始看吧。

下面是 dog.c 的 main 函数部分。当然，有些内容我省略掉了。首先是 init_commands(&commands) 这个函数，我们需要注意下，我们跟进去，看看这个函数具体在干什么。

/*
 * Entry point of the dog command line tool.
 *
 * Invocation shape handled here: dog <command> <subcommand> [options] [args].
 * The function loads the command table, resolves the requested
 * command/subcommand, parses options, performs the initialisation requested
 * by the subcommand's flags and finally dispatches to command_fn().
 *
 * Returns the value produced by command_fn(); several failure paths call
 * exit(EXIT_SYSFAIL) instead of returning.
 */
int main(int argc, char **argv)
{
	int ch, longindex, ret;
	unsigned long flags;
	struct option *long_options;
	const struct command *commands;
	const char *short_options;
	char *p;
	const struct sd_option *sd_opts;
	uint8_t sdhost[16];
	int sdport;
	install_crash_handler(crash_handler);

	/* Fetch the table of top-level commands (see init_commands()). */
	init_commands(&commands);

	/* No command given at all: print the general usage. */
	if (argc < 2)
		usage(commands, 0);

	/*
	 * Look up argv[1]/argv[2] in the command table and keep the returned
	 * flags.
	 * NOTE(review): this presumably also sets up command_opts and
	 * command_fn used below - confirm in setup_commands().
	 */
	flags = setup_commands(commands, argv[1], argv[2]);

	/* Skip "dog <command> <subcommand>" so getopt starts at the options. */
	optind = 3;

	/* Build the getopt_long() tables from the subcommand's option list. */
	sd_opts = build_sd_options(command_opts);
	long_options = build_long_options(sd_opts);
	short_options = build_short_options(sd_opts);

	/*
	 * NOTE(review): mytest_func() is not defined in the visible source;
	 * it looks like a local experiment hook - confirm before relying on it.
	 */
	mytest_func();

	while ((ch = getopt_long(argc, argv, short_options, long_options,
				&longindex)) >= 0) {

	      ......
	      /* The per-option handling was elided by the article's author. */
	}
	/* Turn highlighting off when not writing to a console or in raw mode. */
	if (!is_stdout_console() || raw_output)
		highlight = false;

	/* Subcommands flagged CMD_NEED_NODELIST need the node list up front. */
	if (flags & CMD_NEED_NODELIST) {
		ret = update_node_list(SD_MAX_NODES);
		if (ret < 0) {
			sd_err("Failed to get node list");
			exit(EXIT_SYSFAIL);
		}
	}

	/* CMD_NEED_ARG: at least one argument must remain after the options. */
	if (flags & CMD_NEED_ARG && argc == optind)
		subcommand_usage(argv[1], argv[2], EXIT_USAGE);

	if (init_event(EPOLL_SIZE) < 0)
		exit(EXIT_SYSFAIL);

	if (init_work_queue(get_nr_nodes) != 0) {
		sd_err("Failed to init work queue");
		exit(EXIT_SYSFAIL);
	}

	if (sockfd_init()) {
		sd_err("sockfd_init() failed");
		exit(EXIT_SYSFAIL);
	}
	/* Run the selected subcommand; on a usage error show its usage text. */
	ret = command_fn(argc, argv);
	if (ret == EXIT_USAGE)
		subcommand_usage(argv[1], argv[2], EXIT_USAGE);
	return ret;
}

init_commands(const struct command **commands) 命令初始化函数。

/*
 * Hand the caller the table of top-level dog commands.
 *
 * The table is materialised once, on the first call, into heap memory
 * referenced by a function-local static pointer; every later call stores
 * that same pointer in *commands. The trailing {NULL,} entry terminates
 * the list.
 */
static void init_commands(const struct command **commands)
{
	/* Survives across calls so the table is only built once. */
	static struct command *cached_table;

	if (cached_table == NULL) {
		struct command builtin[] = {
			vdi_command,		/* in vdi.c */
			node_command,		/* in node.c */
			cluster_command,	/* in cluster.c */
			trace_command,		/* in dog.h */
			{NULL,}
		};

		cached_table = (struct command *)xmalloc(sizeof(builtin));
		memcpy(cached_table, builtin, sizeof(builtin));
	}

	*commands = cached_table;
}

vdi_command,

/*
 * Descriptor of the top-level "vdi" command; it is one of the entries
 * copied into the command table by init_commands().
 *
 * Positional fields as written here: the command name, the subcommand
 * table (vdi_cmd) and vdi_parser.
 * NOTE(review): vdi_parser is presumably the option-parsing callback for
 * this command - confirm against its definition.
 */
struct command vdi_command = {
	"vdi",
	vdi_cmd,
	vdi_parser
};

vdi_cmd. 好啦，到这里就差不多了，这里你应该有点感觉了，你在命令行中敲出来的命令，都是存放在这个地方的，对你的每一个命令的响应，也是在这里做出的。比如 dog vdi create，对应的就是数组中的第二项 vdi_cmd[1]。其中的 vdi_create 是一个指向函数的指针。我们可以去看看这个函数的具体内容。

/*
 * Table of "dog vdi <subcommand>" entries, terminated by a {NULL,} entry.
 *
 * Each entry lists, in order: the subcommand name, its argument synopsis,
 * a string of option letters, a one-line description, a nested subcommand
 * table (only "cache" sets one, vdi_cache_cmd; all others use NULL), a
 * flags word built from CMD_NEED_NODELIST / CMD_NEED_ARG, the handler
 * function and the options table (vdi_options).
 * NOTE(review): the field meanings above are read off the values; confirm
 * them against the declaration of struct subcommand.
 */
static struct subcommand vdi_cmd[] = {
	{"check", "<vdiname>", "saph", "check and repair image's consistency",
	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
	 vdi_check, vdi_options},
	{"create", "<vdiname> <size>", "Pycaphrv", "create an image",
	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
	 vdi_create, vdi_options},
	{"snapshot", "<vdiname>", "saphrv", "create a snapshot",
	 NULL, CMD_NEED_ARG,
	 vdi_snapshot, vdi_options},
	{"clone", "<src vdi> <dst vdi>", "sPcaphrv", "clone an image",
	 NULL, CMD_NEED_ARG,
	 vdi_clone, vdi_options},
	{"delete", "<vdiname>", "saph", "delete an image",
	 NULL, CMD_NEED_ARG,
	 vdi_delete, vdi_options},
	{"rollback", "<vdiname>", "saphfrv", "rollback to a snapshot",
	 NULL, CMD_NEED_ARG,
	 vdi_rollback, vdi_options},
	{"list", "[vdiname]", "aprh", "list images",
	 NULL, 0, vdi_list, vdi_options},
	{"tree", NULL, "aph", "show images in tree view format",
	 NULL, 0, vdi_tree, vdi_options},
	{"graph", NULL, "aph", "show images in Graphviz dot format",
	 NULL, 0, vdi_graph, vdi_options},
	{"object", "<vdiname>", "isaph", "show object information in the image",
	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
	 vdi_object, vdi_options},
	{"track", "<vdiname>", "isaph", "show the object epoch trace in the image",
	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
	 vdi_track, vdi_options},
	{"setattr", "<vdiname> <key> [value]", "dxaph", "set a VDI attribute",
	 NULL, CMD_NEED_ARG,
	 vdi_setattr, vdi_options},
	{"getattr", "<vdiname> <key>", "aph", "get a VDI attribute",
	 NULL, CMD_NEED_ARG,
	 vdi_getattr, vdi_options},
	{"resize", "<vdiname> <new size>", "aph", "resize an image",
	 NULL, CMD_NEED_ARG,
	 vdi_resize, vdi_options},
	{"read", "<vdiname> [<offset> [<len>]]", "saph", "read data from an image",
	 NULL, CMD_NEED_ARG,
	 vdi_read, vdi_options},
	{"write", "<vdiname> [<offset> [<len>]]", "apwh", "write data to an image",
	 NULL, CMD_NEED_ARG,
	 vdi_write, vdi_options},
	{"backup", "<vdiname> <backup>", "sFaph", "create an incremental backup between two snapshots",
	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
	 vdi_backup, vdi_options},
	{"restore", "<vdiname> <backup>", "saph", "restore snapshot images from a backup",
	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
	 vdi_restore, vdi_options},
	{"cache", "<vdiname>", "saph", "Run 'dog vdi cache' for more information",
	 vdi_cache_cmd, CMD_NEED_ARG,
	 vdi_cache, vdi_options},
	{NULL,},
};

vdi_create 额，不好意思，我只是随便选了一个，没想到这个函数这么长，但是创建一个 vdi 的过程就是这样的。当然里面又引出了很多新的东西，这就是需要我们去认真分析的东西。相信已经看到了，里面最重要的过程应该是那个 ret = do_vdi_create() 过程。那是下一个应该考虑的过程。

/*
 * Handler for "dog vdi create <vdiname> <size>".
 *
 * Validates the requested size and copy count, creates the VDI through
 * do_vdi_create() and, when vdi_cmd_data.prealloc is set, writes every
 * data object of the new VDI and records it in the inode.
 *
 * argv[optind] must be the VDI name on entry; the size string follows it.
 *
 * Returns EXIT_USAGE when the size is missing, unparsable or too large, or
 * when there are fewer nodes than requested copies; EXIT_FAILURE when
 * reading the new inode or writing an object fails; otherwise the result
 * of do_vdi_create() (without preallocation) or EXIT_SUCCESS.
 */
static int vdi_create(int argc, char **argv)
{
	const char *vdiname = argv[optind++];
	uint64_t size;
	uint32_t vid;
	uint64_t oid;
	uint32_t idx, max_idx, nr_copies = vdi_cmd_data.nr_copies;
	struct sd_inode *inode = NULL;
	/*
	 * Must be signed: it is compared with "< 0" below and is returned
	 * from this int function. Declared as uint32_t, the "< 0" test was
	 * always false and a bad size string was silently accepted.
	 */
	int ret;

	if (!argv[optind]) {
		sd_err("Please specify the VDI size");
		return EXIT_USAGE;
	}
	ret = option_parse_size(argv[optind], &size);
	if (ret < 0)
		return EXIT_USAGE;

	/* Sizes above the old limit are only allowed with a store policy set. */
	if (size > SD_OLD_MAX_VDI_SIZE && 0 == vdi_cmd_data.store_policy) {
		sd_err("VDI size is larger than %s bytes, please use '-y' to "
		       "create a hyper volume with size up to %s bytes",
		       strnumber(SD_OLD_MAX_VDI_SIZE),
		       strnumber(SD_MAX_VDI_SIZE));
		return EXIT_USAGE;
	}

	if (size > SD_MAX_VDI_SIZE) {
		sd_err("VDI size is too large");
		return EXIT_USAGE;
	}

	if (nr_copies > sd_nodes_nr) {
		sd_err("There are not enough nodes(%d) to hold the copies(%d)",
		       sd_nodes_nr, nr_copies);
		return EXIT_USAGE;
	}

	ret = do_vdi_create(vdiname, size, 0, &vid, false,
			    vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy,
			    vdi_cmd_data.store_policy);
	/*
	 * Stop here on failure or when no preallocation was requested.
	 * NOTE(review): this jump also skips the verbose VDI-ID print below,
	 * so the ID is only shown for preallocated VDIs - confirm intended.
	 */
	if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
		goto out;

	inode = xmalloc(sizeof(*inode));

	/* Load the inode of the VDI that was just created. */
	ret = dog_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0,
			      true);
	if (ret != SD_RES_SUCCESS) {
		sd_err("Failed to read a newly created VDI object");
		ret = EXIT_FAILURE;
		goto out;
	}
	max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE);

	/* Preallocate: write each data object and record it in the inode. */
	for (idx = 0; idx < max_idx; idx++) {
		vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
		oid = vid_to_data_oid(vid, idx);

		ret = dog_write_object(oid, 0, NULL, 0, 0, 0, inode->nr_copies,
				      inode->copy_policy, true, true);
		if (ret != SD_RES_SUCCESS) {
			ret = EXIT_FAILURE;
			goto out;
		}

		INODE_SET_VID(inode, idx, vid);
		ret = sd_inode_write_vid(dog_bnode_writer, inode, idx, vid, vid,
					 0, false, true);
		if (ret) {
			ret = EXIT_FAILURE;
			goto out;
		}
	}
	/* Final progress update once all objects have been written. */
	vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
	ret = EXIT_SUCCESS;

	if (verbose) {
		if (raw_output)
			printf("%x\n", vid);
		else
			printf("VDI ID of newly created VDI: %x\n", vid);
	}

out:
	/* inode is NULL on the early paths; free(NULL) is a no-op. */
	free(inode);
	return ret;
}

上面只是讲了一下 sheepdog 中 dog 的一小部分，并且没有深究，当然，我们是需要深究这部分的，看每一个功能从上层到下层的具体实现，这都是很有必要的。今天这些只是讲了最外层的部分，从交互入手，我想这也是认识一个系统的一个比较自然的过程吧。由于时间关系，先写这些，欢迎讨论，待续。。。