理解 Linux Kernel (6)

前一篇已经描述对文件系统进行了宏观性的描述，这一篇，将以特定的文件读写操作为示例，串联对整个文件系统的基本操作。

首先先来看看平台相关的文件读写操作的 C 代码是怎样一个调用方式

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>

int panic()
{
    fprintf(stderr, "%s (errno=%d)\n", strerror(errno), errno);
    return -1;
}

int main(int argc, char *argv[])
{
    /* 打开文件 frw.txt (以可读写 | 若不存在则新建的形式) */
    int fd = open("/root/frw.txt", O_RDWR | O_CREAT);
    if (fd == -1)
        return panic();

    /* 向文件写入 Hello World! 共计 12 个字符 */
    ssize_t wsize = write(fd, "Hello World!", 12);
    if (wsize == -1)
        return panic();

    /* 重定位文件读写指针 */
    off_t off = lseek(fd, 0, SEEK_SET);
    if (off == -1)
        return panic();

    char* buf = (char *) malloc(wsize);
    /* 读取文件内容 */
    ssize_t rsize = read(fd, buf, wsize);
    if (rsize == -1)
        return panic();

    printf("%s\n", buf);
    free(buf);
    /* 关闭文件 */
    int stat = close(fd);
    if (stat == -1)
        return panic();

    return 0;
}

高速缓冲区初始化

上一篇已经描述过了，文件系统的结构、包括数据，都是持久化地存储在存储设备中的。

但是，我们应该也隐约的了解另一个事实，文件读写操作并不会直接操作存储设备上的数据，而是先经过一个称之为高速缓冲的内存区域。

那么，高速缓冲是什么? 究竟承担什么工作? 先来看看它的初始化流程吧。

首先回到 main.c (内核代码的主函数)

void main(void)	
{
 	ROOT_DEV = ORIG_ROOT_DEV;
 	drive_info = DRIVE_INFO;
	memory_end = (1<<20) + (EXT_MEM_K<<10);
	memory_end &= 0xfffff000;
	if (memory_end > 16*1024*1024)
		memory_end = 16*1024*1024;
	if (memory_end > 12*1024*1024)
		buffer_memory_end = 4*1024*1024;
	else if (memory_end > 6*1024*1024)
		buffer_memory_end = 2*1024*1024;
	else
		buffer_memory_end = 1*1024*1024;
	main_memory_start = buffer_memory_end;
#ifdef RAMDISK
	main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
#endif
	mem_init(main_memory_start,memory_end);
	trap_init();
	blk_dev_init();
	chr_dev_init();
	tty_init();
	time_init();
	sched_init();                       // 第四篇已经讲过，负责任务调度模块的初始化
	buffer_init(buffer_memory_end);     // 本篇的起始，负责缓冲区的初始化
	hd_init();
	floppy_init();
	sti();
	move_to_user_mode();
	if (!fork()) {		/* we count on this going ok */
		init();
	}
	for(;;) pause();
}

buffer_init(buffer_memory_end); 用来初始化缓冲区。此处有几个原因:

CPU 读写操作如果直接操作外存，速度上是一个极大的考验。毕竟内存已经较之 CPU 速度慢，外存的读写速度就更慢了。
解耦，其实读写操作并不仅仅发生在外存(块存储设备)，同样的，字符设备等等也都会需要读写操作，增加中间层可以封装变化。
更多，个人了解有限…

struct buffer_head {
 char * b_data;
 unsigned long b_blocknr;
 unsigned short b_dev;
 unsigned char b_uptodate;
 unsigned char b_dirt;
 unsigned char b_count;
 unsigned char b_lock;
 struct task_struct * b_wait;
 struct buffer_head * b_prev;
 struct buffer_head * b_next;
 struct buffer_head * b_prev_free;
 struct buffer_head * b_next_free;
};

/* from fs/buffer.c */
void buffer_init(long buffer_end)
{
	struct buffer_head * h = start_buffer;
	void * b;
	int i;

	if (buffer_end == 1<<20)
		b = (void *) (640*1024);
	else
		b = (void *) buffer_end;
	while ( (b -= BLOCK_SIZE) >= ((void *) (h+1)) ) {
		h->b_dev = 0;
		h->b_dirt = 0;
		h->b_count = 0;
		h->b_lock = 0;
		h->b_uptodate = 0;
		h->b_wait = NULL;
		h->b_next = NULL;
		h->b_prev = NULL;
		h->b_data = (char *) b;
		h->b_prev_free = h-1;
		h->b_next_free = h+1;
		h++;
		NR_BUFFERS++;
        /* 跳过 640K ~ 1M 的显存和 BIOS RAM 部分 */
		if (b == (void *) 0x100000)
			b = (void *) 0xA0000;
	}
	h--;
	free_list = start_buffer;
	free_list->b_prev_free = h;
	h->b_next_free = free_list;
	for (i=0;i<NR_HASH;i++)
		hash_table[i]=NULL;
}

缓冲块的所有关键信息都由 buffer_head 数据结构进行记录, 至于有多少个 buffer_head? 只能说能划分多少就划分多少。

比较直观的结构信息如下

在 main.c 中已经提前确认了内核程序占用内存的大小，缓冲区大小以及主内存大小。

在高速缓冲区的开始位置，都用来存储 buffer_head 的信息，与每个缓冲块(从缓冲区结束位置开始分配)一一对应，直到中间某个位置不足以分配缓冲块为止。

另外的信息，就是可以看到一个 hash_table 数据结构了。

应该都能够想象，每个缓存块与外存中的数据块相对应，通过设备号 + 数据块号进行唯一定位。但是，如何快速查找需要操作的数据块是否已经存在与高速缓存中了呢? 显然直接遍历查找是不太可靠的办法。采用开放地址法的哈希表能够协助快速定位缓冲块。

挂载文件系统

既然高速缓冲区都准备就绪了，那么文件系统是否已经挂载了呢? 很遗憾，知道 main() 调用 init() 函数之前，文件系统依然没有挂载，也就是说，CPU 仍然只能通过基本输入输出对存储设备进行相当初级的 IO 操作。

那么，什么时候才能去挂载根目录呢?

/* from init/main.c */
/* 由 main() 触发 */
void init(void)
{
	int pid,i;
    /* 这是比较重要的一环了，开始挂载的起始动作 */
	setup((void *) &drive_info);
    ...
}

setup 函数做了什么呢？这是一个内嵌汇编，主要做的就是触发系统调用 int 0x80

__inline__ int setup(void * BIOS) { 
    long __res; 
    __asm__ volatile (
            "int $0x80" 
            : "=a" (__res) 
            : "0" (0),"b" ((long)(BIOS))
    ); 
    if (__res >= 0) 
        return (int) __res; 
    errno = -__res; 
    return -1; 
}

其中看到给出的 EAX = 0, 查表(表在 include/linux/sys.h 里) 可以知道触发的是 sys_setup 函数(函数位于 kernel/blk_drv/hd.c)

/* This may be used only once, enforced by 'static int callable' */
int sys_setup(void * BIOS)
{
	static int callable = 1;
	int i,drive;
	unsigned char cmos_disks;
	struct partition *p;
	struct buffer_head * bh;

    /* setup 只允许被调用一次 */
	if (!callable)
		return -1;
	callable = 0;
    /* 可以强制在源码中指定硬盘参数, 所以加了宏定义作判断*/
#ifndef HD_TYPE
	for (drive=0 ; drive<2 ; drive++) {
		hd_info[drive].cyl = *(unsigned short *) BIOS;
		hd_info[drive].head = *(unsigned char *) (2+BIOS);
		hd_info[drive].wpcom = *(unsigned short *) (5+BIOS);
		hd_info[drive].ctl = *(unsigned char *) (8+BIOS);
		hd_info[drive].lzone = *(unsigned short *) (12+BIOS);
		hd_info[drive].sect = *(unsigned char *) (14+BIOS);
		BIOS += 16;
	}
	if (hd_info[1].cyl)
		NR_HD=2;
	else
		NR_HD=1;
#endif
	for (i=0 ; i<NR_HD ; i++) {
		hd[i*5].start_sect = 0;
		hd[i*5].nr_sects = hd_info[i].head*
				hd_info[i].sect*hd_info[i].cyl;
	}

	/*
		We querry CMOS about hard disks : it could be that
		we have a SCSI/ESDI/etc controller that is BIOS
		compatable with ST-506, and thus showing up in our
		BIOS table, but not register compatable, and therefore
		not present in CMOS.

		Furthurmore, we will assume that our ST-506 drives
		<if any> are the primary drives in the system, and
		the ones reflected as drive 1 or 2.

		The first drive is stored in the high nibble of CMOS
		byte 0x12, the second in the low nibble.  This will be
		either a 4 bit drive type or 0xf indicating use byte 0x19
		for an 8 bit type, drive 1, 0x1a for drive 2 in CMOS.

		Needless to say, a non-zero value means we have
		an AT controller hard disk for that drive.


	*/

	if ((cmos_disks = CMOS_READ(0x12)) & 0xf0)
		if (cmos_disks & 0x0f)
			NR_HD = 2;
		else
			NR_HD = 1;
	else
		NR_HD = 0;
	for (i = NR_HD ; i < 2 ; i++) {
		hd[i*5].start_sect = 0;
		hd[i*5].nr_sects = 0;
	}
    /* 更进一步设置每个盘的参数 */
	for (drive=0 ; drive<NR_HD ; drive++) {
        /* 0x300 和 0x305 分别代表两个硬盘 */
        /* 读取每个硬盘的第一块数据 (1024B) */
		if (!(bh = bread(0x300 + drive*5,0))) {
			printk("Unable to read partition table of drive %d\n\r",
				drive);
			panic("");
		}
        /* 判断硬盘有效性 */
		if (bh->b_data[510] != 0x55 || (unsigned char)
		    bh->b_data[511] != 0xAA) {
			printk("Bad partition table on drive %d\n\r",drive);
			panic("");
		}
        /* 读取分区表 (位于 引导扇区第 446 字节开始处 */
		p = 0x1BE + (void *)bh->b_data;
		for (i=1;i<5;i++,p++) {
			hd[i+5*drive].start_sect = p->start_sect;
			hd[i+5*drive].nr_sects = p->nr_sects;
		}
		brelse(bh);
	}
	if (NR_HD)
		printk("Partition table%s ok.\n\r",(NR_HD>1)?"s":"");
	rd_load();              /* 尝试创建并加载虚拟盘 */
	mount_root();           /* mount 根文件系统 */
	return (0);
}

终于到了挂载文件系统的时候了

mount_root 用来做初级的初始化工作。同时，上一节已经描述过了，存储设备的超级块是用了记录整个文件系统最重要的结构。

那么通过将超级块的信息读取到内存，也就可以将内存与外存的文件系统相互联系起来了。

下面这段代码最重要的内容就是 read_super() 函数了 .

void mount_root(void)
{
	int i,free;
	struct super_block * p;
	struct m_inode * mi;

	if (32 != sizeof (struct d_inode))
		panic("bad i-node size");
    /* 先初始化文件表，该版本操作系统限制最大同时打开 NR_FILE(64个) 文件 */
	for(i=0;i<NR_FILE;i++)
        /* f_count = 0 表明没有被引用 */
		file_table[i].f_count=0;
    /* 如果引导盘是软盘的话，提示插入根文件系统盘 */
	if (MAJOR(ROOT_DEV) == 2) {
		printk("Insert root floppy and press ENTER");
		wait_for_keypress();
	}
    /* 初始化内存超级块数据结构 (总共 8 个) */
	for(p = &super_block[0] ; p < &super_block[NR_SUPER] ; p++) {
		p->s_dev = 0;
		p->s_lock = 0;
		p->s_wait = NULL;
	}
    /* Hint: 读取超级块的信息，挂载根文件系统重要的部分(代码请往下翻) */
	if (!(p=read_super(ROOT_DEV)))
		panic("Unable to mount root");
    /* 读取文件系统的 1 号i节点 (即该设备上文件系统的根节点) */
	if (!(mi=iget(ROOT_DEV,ROOT_INO)))
		panic("Unable to read root i-node");
	mi->i_count += 3 ;	/* NOTE! it is logically used 4 times, not 1 */
	p->s_isup = p->s_imount = mi;
    /* 应该还记得吧，current 指的是当前的任务(任务1)，以后所有的任务都会由任务1或任务1的子任务进行派生，也就意味着 current->root 会一直复制过去
     * 到这里为止，应该认为根文件系统以及被挂载了。是不是跟被耍了一样?
     */
	current->pwd = mi;
	current->root = mi;
	free=0;
	i=p->s_nzones;
    /* 统计还有多少空闲数据块以及多少可用i节点 (附上一张启动过程中打印的信息) */
	while (-- i >= 0)
		if (!set_bit(i&8191,p->s_zmap[i>>13]->b_data))
			free++;
	printk("%d/%d free blocks\n\r",free,p->s_nzones);
	free=0;
	i=p->s_ninodes+1;
	while (-- i >= 0)
		if (!set_bit(i&8191,p->s_imap[i>>13]->b_data))
			free++;
	printk("%d/%d free inodes\n\r",free,p->s_ninodes);
}

重要要的部分，read_super(int dev)，用于读取超级块的数据

static struct super_block * read_super(int dev)
{
	struct super_block * s;
	struct buffer_head * bh;
	int i,block;

	if (!dev)
		return NULL;
	check_disk_change(dev);
    /* 如果该超级块已经在内存中了，那么就直接使用 (和这里挂载根文件系统的流程无关) */
	if (s = get_super(dev))
		return s;
    /* 设备超级块不在内存中，就先找一个空闲的内存超级块 (总共维护 8 个超级块数据结构) */
	for (s = 0+super_block ;; s++) {
		if (s >= NR_SUPER+super_block)
			return NULL;
		if (!s->s_dev)
			break;
	}
	s->s_dev = dev;
	s->s_isup = NULL;
	s->s_imount = NULL;
	s->s_time = 0;
	s->s_rd_only = 0;
	s->s_dirt = 0;
	lock_super(s);
    /* 通过 block read 读取设备第一个物理块 (每个物理块 1024 B) */
	if (!(bh = bread(dev,1))) {
		s->s_dev=0;
		free_super(s);
		return NULL;
	}
    /* 复制一份超级块的数据 */
	*((struct d_super_block *) s) =
		*((struct d_super_block *) bh->b_data);
    /* 释放缓冲区的数据 */
	brelse(bh);
    /* 验证魔数, 该版本操作系统只支持 1.0 版 Minix 文件系统，魔数 0x137F */
	if (s->s_magic != SUPER_MAGIC) {
		s->s_dev = 0;
		free_super(s);
		return NULL;
	}
    /* 先清空内存中的数据 */
	for (i=0;i<I_MAP_SLOTS;i++)
		s->s_imap[i] = NULL;
	for (i=0;i<Z_MAP_SLOTS;i++)
		s->s_zmap[i] = NULL;
	block=2;
    /* 读取 i 节点位图块 */
	for (i=0 ; i < s->s_imap_blocks ; i++)
		if (s->s_imap[i]=bread(dev,block))
			block++;
		else
			break;
    /* 读取数据块位图 */
	for (i=0 ; i < s->s_zmap_blocks ; i++)
		if (s->s_zmap[i]=bread(dev,block))
			block++;
		else
			break;
	if (block != 2+s->s_imap_blocks+s->s_zmap_blocks) {
		for(i=0;i<I_MAP_SLOTS;i++)
			brelse(s->s_imap[i]);
		for(i=0;i<Z_MAP_SLOTS;i++)
			brelse(s->s_zmap[i]);
		s->s_dev=0;
		free_super(s);
		return NULL;
	}
	s->s_imap[0]->b_data[0] |= 1;
	s->s_zmap[0]->b_data[0] |= 1;
    /* 与前面的 wait_on_super() 对应(解开lock标志) */
	free_super(s);
	return s;
}

是不是觉得也没什么，就这样根文件系统已经挂载了? 毫无实感是吧。

Extra: 普通挂载

既然讲过了根文件系统的挂载。那就顺带着讲讲普通文件系统的挂载吧。

相信从命令上来讲应该比较简单也比较熟悉吧。mount disk.img /mnt 也算是挂载到 /mnt 下了

但是，究竟是怎么实现的呢?

int sys_mount(char * dev_name, char * dir_name, int rw_flag)
{
	struct m_inode * dev_i, * dir_i;
	struct super_block * sb;
	int dev;

    /** 
     * 省略大部分判断逻辑, 主要就是:
     * 1. 判断 dev_name 所属的设备号，读取该设备上的超级块
     * 2. 读取 dir_name (需要挂载到的位置)，判定是否允许被挂载(比如根节点不允许挂其它设备)
     */
    ...

    /* 设置超级块的 mount 标志 */
	sb->s_imount=dir_i;
    /* 设置该 i 节点的 mount 标志 */
	dir_i->i_mount=1;
	dir_i->i_dirt=1;		/* NOTE! we don't iput(dir_i) */
	return 0;			/* we do that in umount */
}

文件读写

前面讲了这么多，终于到了最关心的部分了。当然，也并不是说前面的内容不重要，事实上，相当重要，只是都隐藏在了内核引导的过程中，且调用频度低，才导致了没有存在感。但是这恰恰才是支持文件读写的基石。

不多说废话，下面就要开始文件读写的内容。

打开文件

打开文件的函数原型是 int open(const char * filename, int flag, ...);

当然，此类系统调用最终的实现都是 int 0x80 , 明确一个调用号，然后就陷入内核态了。

内核态下调用的函数是: int sys_open(const char * filename,int flag,int mode)

来看看细节:

int sys_open(const char * filename,int flag,int mode)
{
	struct m_inode * inode;
	struct file * f;
	int i,fd;

    /*
     * current 是由内核数据段维护的当前任务的指针
     * umask 是指当前任务在新建文件时的默认掩码
     *   例如 Linux 默认是 022, 即新建文件被禁止了组用户与其它用户的写权限
     * 这里是先确定新建文件的权限
     */
	mode &= 0777 & ~current->umask;
    /*
     * 文件描述符，每个文件单独维护一套，以数字标记
     * 找一个空闲的文件描述符项
     */
	for(fd=0 ; fd<NR_OPEN ; fd++)
		if (!current->filp[fd])
			break;
	if (fd>=NR_OPEN)
		return -EINVAL;
    /*
     * 顾名思义，设置在调用 exec() 函数时主动关闭的文件
     * exec() 通常与 fork() 联用，fork() 负责复制一个任务，而 exec 负责替换新任务的代码和数据段(从而产生一个新的任务)
     * 这里的声明即是复位 fd 位置的标志，允许子任务也持有相同的文件描述符项
     */
	current->close_on_exec &= ~(1<<fd);
	f=0+file_table;
    /* 在文件表中找一项空闲的 */
	for (i=0 ; i<NR_FILE ; i++,f++)
		if (!f->f_count) break;
	if (i>=NR_FILE)
		return -EINVAL;
    /* 当前任务的文件描述符项指向文件表项, 同时文件表项的引用计数+1*/
	(current->filp[fd]=f)->f_count++;
    /* 调用 open_namei 打开文件，如果失败则释放刚才占用的文件结构，并返回错误码 */
	if ((i=open_namei(filename,flag,mode,&inode))<0) {
		current->filp[fd]=NULL;
		f->f_count=0;
		return i;
	}
    /* 
     * 对不同的文件进行不同的特殊处理, 毕竟有 "一切皆文件" 的口号嘛
     * 诸如字符设备等也都是文件
     */
/* ttys are somewhat special (ttyxx major==4, tty major==5) */
	if (S_ISCHR(inode->i_mode))
		if (MAJOR(inode->i_zone[0])==4) {
			if (current->leader && current->tty<0) {
				current->tty = MINOR(inode->i_zone[0]);
				tty_table[current->tty].pgrp = current->pgrp;
			}
		} else if (MAJOR(inode->i_zone[0])==5)
			if (current->tty<0) {
				iput(inode);
				current->filp[fd]=NULL;
				f->f_count=0;
				return -EPERM;
			}
/* Likewise with block-devices: check for floppy_change */
	if (S_ISBLK(inode->i_mode))
		check_disk_change(inode->i_zone[0]);
    /* 初始化内存文件结构的各个参数 */
	f->f_mode = inode->i_mode;
	f->f_flags = flag;
	f->f_count = 1;
	f->f_inode = inode;
	f->f_pos = 0;
	return (fd);
}

在整个打开文件的过程中，除了真正去文件系统查找文件的时候用到了文件名，其它时候，都将是以文件描述符进行交互的。

再看看更细节的方面，毕竟就目前来说，我们跳过了最重要的一环 open_namei ，从而看似整个流程都简单了很多很多。

通常我们在编码过程中都是通过绝对地址/相对地址来唯一定位一个文件。因此，就必然存在逐级寻找文件的过程


static struct m_inode * get_dir(const char * pathname)
{
	char c;
	const char * thisname;
	struct m_inode * inode;
	struct buffer_head * bh;
	int namelen,inr,idev;
	struct dir_entry * de;

    /* 判定当前任务设定的根节点是否有效 */
	if (!current->root || !current->root->i_count)
		panic("No root inode");
    /* 判定当前路径i节点是否有效 */
	if (!current->pwd || !current->pwd->i_count)
		panic("No cwd inode");
    /* 
     * 这里的 get_fs_byte(..) 是宏定义，fs 指的是 FS 段寄存器
     * Linux 内核将 DS, ES 用于内核数据段, 用 FS 指向局部描述符表的当前任务数据段
     * 这里可以简单理解成取字符数组的第一个字节
     */
	if ((c=get_fs_byte(pathname))=='/') {
		inode = current->root;
		pathname++;
	} else if (c)
		inode = current->pwd;
	else
		return NULL;	/* empty name is bad */
	inode->i_count++;
	while (1) {
		thisname = pathname;
		if (!S_ISDIR(inode->i_mode) || !permission(inode,MAY_EXEC)) {
			iput(inode);
			return NULL;
		}
		for(namelen=0;(c=get_fs_byte(pathname++))&&(c!='/');namelen++)
			/* nothing */ ;
		if (!c)
			return inode;
		if (!(bh = find_entry(&inode,thisname,namelen,&de))) {
			iput(inode);
			return NULL;
		}
		inr = de->inode;
		idev = inode->i_dev;
		brelse(bh);
		iput(inode);
		if (!(inode = iget(idev,inr)))
			return NULL;
	}
}

/*
 *	dir_namei()
 *
 * 处理路径 pathname, 处理成i节点表示的最终一级目录+目录下文件名(也可能pathname表示的就是目录)
 */
static struct m_inode * dir_namei(const char * pathname, int * namelen, const char ** name)
{
	char c;
	const char * basename;
	struct m_inode * dir;

	if (!(dir = get_dir(pathname)))
		return NULL;
	basename = pathname;
	while (c=get_fs_byte(pathname++))
		if (c=='/')
			basename=pathname;
	*namelen = pathname-basename-1;
	*name = basename;
	return dir;
}

/*
 *	open_namei()
 *
 * namei for open - this is in fact almost the whole open-routine.
 */
int open_namei(const char * pathname, int flag, int mode,
	struct m_inode ** res_inode)
{
	const char * basename;
	int inr,dev,namelen;
	struct m_inode * dir, *inode;
	struct buffer_head * bh;
	struct dir_entry * de;

	if ((flag & O_TRUNC) && !(flag & O_ACCMODE))
		flag |= O_WRONLY;
	mode &= 0777 & ~current->umask;
	mode |= I_REGULAR;
	if (!(dir = dir_namei(pathname,&namelen,&basename)))
		return -ENOENT;
    /* 如果给的 pathname 是一个目录 */
	if (!namelen) {			/* special case: '/usr/' etc */
		if (!(flag & (O_ACCMODE|O_CREAT|O_TRUNC))) {
			*res_inode=dir;
			return 0;
		}
		iput(dir);
		return -EISDIR;
	}
    /* 找到目录对应的i节点的数据块 */
	bh = find_entry(&dir,basename,namelen,&de);
	if (!bh) {
		if (!(flag & O_CREAT)) {
			iput(dir);
			return -ENOENT;
		}
		if (!permission(dir,MAY_WRITE)) {
			iput(dir);
			return -EACCES;
		}
		inode = new_inode(dir->i_dev);
		if (!inode) {
			iput(dir);
			return -ENOSPC;
		}
		inode->i_uid = current->euid;
		inode->i_mode = mode;
		inode->i_dirt = 1;
		bh = add_entry(dir,basename,namelen,&de);
		if (!bh) {
			inode->i_nlinks--;
			iput(inode);
			iput(dir);
			return -ENOSPC;
		}
		de->inode = inode->i_num;
		bh->b_dirt = 1;
		brelse(bh);
		iput(dir);
		*res_inode = inode;
		return 0;
	}
	inr = de->inode;
	dev = dir->i_dev;
	brelse(bh);
	iput(dir);
	if (flag & O_EXCL)
		return -EEXIST;
	if (!(inode=iget(dev,inr)))
		return -EACCES;
	if ((S_ISDIR(inode->i_mode) && (flag & O_ACCMODE)) ||
	    !permission(inode,ACC_MODE(flag))) {
		iput(inode);
		return -EPERM;
	}
	inode->i_atime = CURRENT_TIME;
	if (flag & O_TRUNC)
		truncate(inode);
	*res_inode = inode;
	return 0;
}

文件写入

接下来就要进行文件写入的流程了

如何陷入内核态函数就不再细说了，相信看过这么多系统调用之后，也能知道基本上系统调用在内核态对应的函数都是以 sys_ 形式出现的

int sys_write(unsigned int fd,char * buf,int count)
{
	struct file * file;
	struct m_inode * inode;

    /* 非法 fd , 抛异常 */
	if (fd>=NR_OPEN || count <0 || !(file=current->filp[fd]))
		return -EINVAL;
    /* count = 0，无需写入数据 */
	if (!count)
		return 0;
	inode=file->f_inode;
    /* 针对不同的i节点类型，有不同的写入函数 */
	if (inode->i_pipe)
		return (file->f_mode&2)?write_pipe(inode,buf,count):-EIO;
	if (S_ISCHR(inode->i_mode))
		return rw_char(WRITE,inode->i_zone[0],buf,count,&file->f_pos);
	if (S_ISBLK(inode->i_mode))
		return block_write(inode->i_zone[0],&file->f_pos,buf,count);
	if (S_ISREG(inode->i_mode))
		return file_write(inode,file,buf,count);
	printk("(Write)inode->i_mode=%06o\n\r",inode->i_mode);
	return -EINVAL;
}

看看对于常规文件是怎么操作的吧。

int file_write(struct m_inode * inode, struct file * filp, char * buf, int count)
{
	off_t pos;      /* 偏移量 */
	int block,c;
	struct buffer_head * bh;
	char * p;
	int i=0;

    /* 如果是 Append 模式，把偏移量重置到文件末尾 */
	if (filp->f_flags & O_APPEND) 
		pos = inode->i_size;
    /* 否则就使用当前文件数据结构持有的偏移量 */
    /*
        附上数据结构  file 的内容 
        struct file {
        	unsigned short f_mode;
        	unsigned short f_flags;
        	unsigned short f_count;
        	struct m_inode * f_inode;
        	off_t f_pos;    每个打开的文件都将持有当前的偏移值
        };
     */
	else
		pos = filp->f_pos;
    /* 逐字符向缓冲区写入数据 */
	while (i<count) {
        /* 最开始当然是创建在磁盘上占用一个数据块了 (如果文件对应的块不存在的话) */
		if (!(block = create_block(inode,pos/BLOCK_SIZE)))
			break;
        /* 根据数据块获得相应的缓冲块 */
		if (!(bh=bread(inode->i_dev,block)))
			break;
        /* 在缓冲块中的偏移量 */
		c = pos % BLOCK_SIZE;
        /* 定位到具体的缓冲区的内存地址 */
		p = c + bh->b_data;
		bh->b_dirt = 1;
        /* 当前这个缓冲块还有多少字节可写 */
		c = BLOCK_SIZE-c;
        /* 如果需要写入的数据量少于 c */
		if (c > count-i) c = count-i;
        /* 添加偏移量计数, 更新数据结构中维护的值 */
		pos += c;
		if (pos > inode->i_size) {
			inode->i_size = pos;
			inode->i_dirt = 1;
		}
		i += c;
        /* 向缓冲块逐字节写入数据 */
		while (c-->0)
			*(p++) = get_fs_byte(buf++);
        /* 释放对缓冲块的占用，当然，在释放前会完成缓冲块<->外存数据块的同步 */
		brelse(bh);
	}
	inode->i_mtime = CURRENT_TIME;
	if (!(filp->f_flags & O_APPEND)) {
        /* 非 APPEND 模式，更新文件读写指针(偏移量); APPEND 模式是使用 inode->i_size ，所有就不需要在这里更新了 */
		filp->f_pos = pos;
		inode->i_ctime = CURRENT_TIME;
	}
	return (i?i:-1);
}

是不是觉得也没有什么太高大上的操作。确实如此，更多关于缓存块与文件系统数据块的同步都已经被包装到 bread(), brelse() 中了。

不过，暂时无需细究。总之到此为止，先要有一个基础的观念: 所有与外存储器(这里也包括控制台等)进行数据交互都必须经过缓冲区

缓冲区封装了对外存储器的全部操作，而提供给 CPU 更高效的 I/O 操作，当然，也更为简单快捷

文件读取

至于文件读取，也基本类似了，所以也就不再深入描述。

当然，要注意的就是，在本篇开始的部分提供的例程中，write & read 中插入了 off_t off = lseek(fd, 0, SEEK_SET); 这样的代码。

原因应该也能够想到，学习 sys_write(..) 的时候我们已经看到，任务对同一个文件在内存中维护了一个文件读写偏移量。因此，要读取刚才写入的内容，就不得不先改动这个读写偏移量了

int sys_read(unsigned int fd,char * buf,int count)
{
	struct file * file;
	struct m_inode * inode;

	if (fd>=NR_OPEN || count<0 || !(file=current->filp[fd]))
		return -EINVAL;
	if (!count)
		return 0;
	verify_area(buf,count);
	inode = file->f_inode;
	if (inode->i_pipe)
		return (file->f_mode&1)?read_pipe(inode,buf,count):-EIO;
	if (S_ISCHR(inode->i_mode))
		return rw_char(READ,inode->i_zone[0],buf,count,&file->f_pos);
	if (S_ISBLK(inode->i_mode))
		return block_read(inode->i_zone[0],&file->f_pos,buf,count);
	if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode)) {
		if (count+file->f_pos > inode->i_size)
			count = inode->i_size - file->f_pos;
		if (count<=0)
			return 0;
		return file_read(inode,file,buf,count);
	}
	printk("(Read)inode->i_mode=%06o\n\r",inode->i_mode);
	return -EINVAL;
}

小结

这篇对文件系统的代码层面的描述，仅仅只是捡了一个相当有限的片面 (常规文件读写)。从文件系统的挂载开始提供了一个读写的完整流程介绍(当然，很多细节是缺失的，不过不要着急)。

虽然平常都能够了解到一个比较模糊的前提，文件读写需要利用缓冲区，但是究竟什么是缓存区，如何使用都不会有太多的概念。本篇最大的重点，就是首先请读者们建立起一个基础性的对缓冲区的了解。事实上，这个中介在 I/O 中扮演了相当重要的角色。而且内存也为其提供了相当大的一份空间，差不多有 1/4 了。

跨了差不多半个多月来写，上下文的承接可能有些生硬了，甚至不一致了… 尴尬…

  __                    __                  
 / _| __ _ _ __   __ _ / _| ___ _ __   __ _ 
| |_ / _` | '_ \ / _` | |_ / _ \ '_ \ / _` |
|  _| (_| | | | | (_| |  _|  __/ | | | (_| |
|_|  \__,_|_| |_|\__, |_|  \___|_| |_|\__, |
                 |___/                |___/