File Systems in XV6

Brian Pan
14 min readMar 18, 2018

Main ideas for this post

  1. File System Architecture
  2. Crash Recovery
  3. Cache Mechanism

File System Architecture

layout of File System

Layout of File System

Notice: Block 0, 1, 2 are fixed

Block 0: Boot code

Block 1: Super Block. Store metadata about the file system

  • Size (# of the blocks)
  • # of data blocks
  • # of inodes
  • # of blocks in log
  • Besides, super block also fills in by a small program called mkfs (mkfs.c) which is an initial file system

Block 2: Log area. Use for transactions. Maintain consistency in case of a power outrage or system shutdown accidentally

Inodes: Unnamed files

Bitmap: An area to check which blocks are in use

Data area: Actual data located

Bottom up Approach

  • File descriptors
  • Recursive lookup
  • Directory Inodes
  • Inodes and Block allocator
  • Logging
  • Buffer Cache
  • Disk

Disk

Actual Disk

Buffer Cache (Block)

Purpose: Read & Write date from a block device

Goals:

  1. Synchronizations :
  • Synchronize disk blocks and ensure only one copy of a data block exist in the kernel
  • Make sure only one writer(kernel process) updates the copy at a time
  1. Caching:
  • Cache frequently used data in buffer for efficiency

Buffer definition (buf.h)

struct buf {
int flags; // buf status
uint dev; // device flag, we may have multiple devices
uint blockno; // block no on disk
struct sleeplock lock; // a lock for sleeplock
struct buf *prev; // LRU cache list
struct buf *next;
struct buf *qnext; // disk queue
uchar data[BSIZE]; // size of one buf is 512KB
};// buf status
#define B_VALID 0x2 // buffer has been read from disk
#define B_DIRTY 0x4 // buffer needs to be written to disk

Buffer cache definition (bio.c)

Notice: bcache.head is a dummy head the bcache.head.next is the actual most recent used buf

// the buffer cache structure
struct {
struct spinlock lock;
struct buf buf[NBUF]; // NBUF : MAXOPBLOCKS*3 size of the disk block cache , MAXOPBLOCKS = 10 : max # of blocks any FS op writes
// Linked list of all buffers, through prev/next.
// head.next is most recently used.
struct buf head;
} bcache;

Interfaces of buffer cache

  • bget: find an available buffer in cache line
  • bread: obtain a copy of a disk block for reading
  • bwrite: write a modified copy into a block
  • brelease: only one thread holds access and the others are blocked and waited until the the current thread released (brelease)

Code Snippets

// Return a locked buf with the contents of the indicated block.
struct buf*
bread(uint dev, uint blockno)
{
struct buf *b;
b = bget(dev, blockno);
// not valid
if( (b->flags & B_VALID) == 0 ) {
// need to read from disk
iderw(b);
}
return b;
}
// write b's content to disk. Must be locked !!
void
bwrite(struct buf *b)
{
// must hold the sleeplock
if(!holdingsleep(&b->lock))
panic("bwrite");
b->flags = |= B_DIRTY;
iderw(b);
}
// Look through buffer cache for block on device dev.
// If not found, allocate a buffer.
// In either case, return locked buffer.
static struct buf*
bget(uint dev, uint blockno)
{
struct buf *b;

acquire(&bcache.lock);

// Is the block really cached?
for(b = bcache.head.next; b != &bcache.head; b=b->next){
if(b->dev == dev && b->blockno == blockno) {
b->refcnt++; //referenced counter ++
release(&bcache.lock);
acquiresleep(&b.lock);
// return as a static buf ptr
return b;
}
}

// Not cached; recycle an unused buffer.
// Even if refcnt==0, B_DIRTY indicates a buffer is in use
// because log.c has modified it but not yet committed it.
// LRU from last to head
for(b=bcache.head.prev; b != &bcache.head; b=b.prev) {
if(b->refcnt==0 && (b->flags & B_DIRTY)==0) {
b->dev = dev;
b->blockno = blockno;
b->flags = 0;
b->refcnt = 1;
release(&bcache.lock);
acquiresleep(&b->lock);
return b;
}
}
panic("bget: no buffers");
}// Release a locked buffer.
// Move to the head of the MRU list
void
brelse(stuct buf *b)
{
if(!holdingsleep(&b->lock))
panic("brelse");

releasesleep(&b->lock);

acquire(&bcache.lock);
b->refcnt--;
if(b->refcnt == 0) {
// no one is waiting for it.
// linked list remove b
b->next->prev = b->prev;
b->prev->next = b->next;

// put to bcache.head.next
b->next = bcache.head.next;
b->prev = &bcache.head;
// as the head of bcache
bcache.head.next->prev = b;
bcache.head.next = b;
}

release(&bcache.lock);
}

Example: install_trans(void) -> commit data from log to their home destination

static void
install_trans(void)
{
int tail;
// log.lh comes from read_head
// bread(log.dev, log.start);
// struct logheader *lh = (struct logheader *)(buf->data);
for(tail = 0; tail < log.lh.n;tail++) {
struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
struct buf *dbuf = bread(log.dev, log.lh.block[tail]); //read dest
memmove(dbuf->data, lbuf->data, BSIZE);
bwrite(dbuf); // write dst to disk
brelse(lbuf);
brelse(dbuf);
}
}

Logging (Transactions)

Purpose: Do transactions to achieve crash recovery. Transactions mean group multiple writes into one bundle

Goal: For Consistency

To be more specific, file system operations involve multiple writes to disk. Once the system crashes, it may leave the inconsistent state of the file system.

Ex: During file deletion, it can leave file system with

  • Leave an allocated but unreferenced block
  • Leave an inode referenced to a content block that is marked free

Solution

Main idea: Do not write to disk directly !

Instead,

  1. Logged data into journel
  2. Once all writes are logged, the system writes a special commit record
  • Indicated that the log contains a complete transaction

3. Then, file system copies those logs to on disk location. (install_trans talked in last example)

  • Erase data in log after complete

Recovery Methodology

  1. After rebooting, copy the log if the status is marked as complete
  2. Otherwise, discard all writes in log

Interfaces of Logging

  • begin_op: called at the start of each FS system call
  • log_write: caller modified buffer as B_DIRTY and data. Then, write the buffer’s block id into log header
  • end_op: caller calls at the end of each FS system call. commits it if this is the last outstanding operation
  • commit: process from cache->log, header to disk, log->disk
  • write_log: write modified data from cache to log
  • write_head: write log header’s block num info from log header to disk
  • install_trans: write data from log to to blockno in disk

Code Snippet (log.c)

// used structs
struct logheader {
int n; // # of the logged block
int block[LOGSIZE]; # the logged block's blockno
}
struct log {
struct spinlock lock;
int start;
int size;
int outstanding; // how many FS sys calls are executing.
int committing; // in commit(), please wait.
int dev;
struct logheader lh;
}
struct log log;// called at the start of each FS system call.
void
begin_op(void)
{
acquire(&log.lock);
while(1) {
if(log.committing) {
sleep(&log, &log.lock);
} else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE) {
// this op might exhaust log space; wait for commit.
sleep(&log, &log.lock);
} else {
log.outstanding += 1;
release(&log.lock);
break;
}
}
// Caller has modified b->data and is done with the buffer.
// Record the block number and pin in the cache with B_DIRTY.
// commit()/write_log() will do the disk write.
//
// log_write() replaces bwrite(); a typical use is:
// bp = bread(...)
// modify bp->data[]
// log_write(bp)
// brelse(bp)
void
log_write(struct buf *b)
{
int i;
if(log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
panic("too big a transaction");
if(log.outstanding < 1)
panic("log_write outside of trans");

acquire(&log.lock);
for(i=0; i<log.lh.n;i++) {
if(log.lh.block[i] == b->blockno) //log absorbtion
break;
}
log.lh.block[i] = b->blockno;
if(i == log.lh.n)
log.lh.n++ // increase logged block size
b->flags |= B_DIRTY;
release(&log.lock);
}
// called at the end of each FS system call.
// commits if this was the last outstanding operation.
void
end_op(void) {
int do_commit = 0;

acquire(&log.lock);
log.outstanding -= 1;
if(log.committing)
panic("log committing");
if(log.outstanding == 0) {
do_commit = 1; // no anyone do transaction
log.committing = 1;
} else {
/ begin_op() may be waiting for log space,
// and decrementing log.outstanding has decreased
// the amount of reserved space.
wakeup(&log);
}
release(&log.lock);

if(do_commit){
// call commit w/o holding locks, since not allowed
// to sleep with locks.
commit();
acquire(&lock.lock);
log.committing = 0;
wakeup(&log);
release(&log.lock);
}
}

more on code snippets (Commit)

Notice: log.start+tail+1 : plus one is for log header

static void
commit()
{
if(log.lh.n > 0) {
write_log(); // Write modified blocks from cache to log
write_head(); // Write header to disk -- the real commit
install_trans(); // Now install writes to home locations
log.lh.n = 0;
write_head(); // Erase the transaction from the log
}
}
// Copy modified blocks from cache to log
static void
write_log(void)
{
int tail;
// loop the log
for(tail=0;tail<log.n.h;tail++) {
struct buf *to = bread(log.dev, log.start+tail+1); // log block
struct buf *from = bread(log.dev,log.lh.block[tail]); // cache block
memove(to->data, from->data, BSIZE);
bwrite(to);
brelse(from);
brelse(to);
}
}
// Write in-memory log header to disk.
// This is the true point at which the
// current transaction commits.
static void
write_head(void)
{
struct buf *buf = bread(log.dev, log.start);

struct logheader *hb = (struct logheader *) (buf->data);
int i;
hb->n = log.lh.n;
// put logheader's block into disk
for(i=0;i<hb->n;i++) {
hb->block[i] = log.lh.block[i];
}
bwrite(buf);
brelse(buf);
}
static void
install_trans(void)
{
int tail;
// log.lh comes from read_head
// bread(log.dev, log.start);
// struct logheader *lh = (struct logheader *)(buf->data);
for(tail = 0; tail < log.lh.n;tail++) {
struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
struct buf *dbuf = bread(log.dev, log.lh.block[tail]); //read dest
memmove(dbuf->data, lbuf->data, BSIZE);
bwrite(dbuf); // write dst to disk
brelse(lbuf);
brelse(dbuf);
}
}

To be mention, the brief process of transactions is:

  1. log_write: writes modified blocks’ blockno into log header

2. write_log: write modified data in cache into log

3. write_head: write log header’s blockno info into disk

4. install_trans: write log data into real dest

5. update there is no block waits for transaction :

  • log.lh.n = 0
  • log.committing = 0

File Inodes (Files)

Purpose: Unnamed files represent as Inodes. Usually, inodes are consisted of several blocks

Inode

  • On disk: Holds metadata -> File type, size, # of links referring to it, list of blocks with data
  • In memory: A copy of an on-disk inode + some additional kernel information

-> Reference counter: ip->ref

  • > Synchronization Flag: ip->flags
Inode representation in disk
  1. Inodes stores as an array on disk
  • point by sb.startinode

2. Each inodes has a number representing the position on disk

3. The kernel keeps a cache of inode in memory (Synchronization issue)

struct dinode (fs.h)

Notice: addrs[NDIRECT+1] because there is one indirect entry in a inode

// On-disk inode structure
struct dinode {
short type; // File type
short major; // Major device number (T_DEV only)
short minor; // Minor device number (T_DEV only)
short nlink; // Number of links to inode in file system
uint size; // Size of file (bytes)
uint addrs[NDIRECT+1]; // Data block addresses
};

Calculate max size of inode structure

One layer Indirect inode example:

BSIZE = 512B

Direct address Number = 12

Direct data size = 12*512B

Indirect block has 512B/4B = 128 entries

Each entry point to 512B data

That is indirect data size is 128*512B

Total size is 140*512B = 71680B

Figure representing one layer indirect inode

struct in-memory copy of inode (file.h)

// in-memory copy of an inode
struct inode {
// more info for kernel
uint dev; // Device number
uint inum; // Inode number
int ref; // Reference count
struct sleeplock lock; // protects everything below here
int valid; // inode has been read from disk?
short type; // copy of disk inode
short major;
short minor;
short nlink;
uint size;
uint addrs[NDIRECT+1];
};

struct of icache(fs.c)

struct {
struct spinlock lock;
struct inode inode[NINODE]; // NINODE = 50 in param.h
} icache;

Inode Life Cycle

  • Allocation on disk
  1. ialloc()
  2. iput() // dealloc
  • Referencing
  1. ip->ref tracks # of active pointers to an inode in memory

Typical accessing inode example in Xv6 source code

ip = iget(dev, inum);
ilock(ip);
... examine and modify ip->xxx ...
iunlock(ip);
iput(ip);

Interfaces of Inode

  • iget: find the inode with number inum on device dev and return an in-memory copy (not copy the data already, just do some kernel info setup)
  • readi: read inode
  • writei: write inode
// Find the inode with number inum on device dev
// and return the in-memory copy. Does not lock
// the inode and does not read it from disk.
static struct inode*
iget(uint dev, uint inum)
{
struct inode *ip, *empty;

acquire(&icache->lock);

// Is the inode already cached?
for(ip=&icache.inode[0]; i<&inoce.inode[NINODE];ip++) {
if(ip->ref > 0 && ip->dev == dev && ip->num == inum) {
ip->ref++;
release(&icache->lock);
return ip;
}
if(empty == 0 && ip->ref == 0)
empty = ip; // Remember empty slot.
}

if(empty == 0)
panic("iget: no inodes");

ip = empty;
ip->dev = dev;
ip->inum = inum;
ip->ref = 1;
ip->valid = 0;
release(&icache.lock);

return ip;
}
// Read data from inode.
// Caller must hold ip->lock.
int
readi(struct inode *ip, char *dst, uint off, uint n)
{
uint tot, m;
struct buf *bp;

if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || devsw[ip->major].read)
return -1;
return devsw[ip->major].read(ip, dst, n);
}

// offset can not over inode size or overflow
if(off > ip->size || off+n < off)
return -1;
// trim read size because n over the limit
if(off+n > ip->size)
n = ip->size - off;
// for loop to read from block
for(tot=0;tot<n;tot+=m,off+=m, dst+=m) {
// bmap Return the disk block address of the nth block in inode ip which is the blockno
bp = bread(ip->dev, bmap(ip, off/BSIZE));
// get block size or if is in the last only pick up the rest
// also should not include offset
m = min(n-tot, BSIZE -off%BSIZE);
memmove(dst, bp->data + off%BSIZE, m);
brelse(bp);
}
return n;
}
// Write data to inode.
// Caller must hold ip->lock.
int
writei(struct inode *ip, char *src, uint, off, uint n)
{
uint tot, m;
struct buf *bp;

if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
return -1;
return devsw[ip->major].write(ip, src, n);
}

if(off > ip->size || off + n < off)
return -1;
if(off+n > MAXFILE*BSIZE)
return -1;
// write data to dst by BSIZE
for(tot=0;tot<n;tot+=m,off+=m, dst+=m) {
bp = bread(ip->dev, bmap(ip, off/BSIZE));
m = min(n-tot, BSIZE - off%BSIZE);
memmove(bp->data + off%BSIZE, src, m);
// write to log header
log_write(bp);
brelse(bp);
}
// update inode size
if(n > 0 && off > ip->size) {
ip->size = off;
iupdate(ip);
}
return n;
}

bmap : Solve the issue how to get the blockno from inode given blockno in inode (fs.c)

static uint 
bmap(struct inode *ip, uint bn)
{
uint addr, *a;
struct buf *bp;
// is direct address
if(bn <NDIRECT) {
if((addr = ip->addrs[bn]) == 0)
ip->addrs[bn] = balloc(ip->dev);
return addr
}

bn -= NDIRECT;
// IDIRECT 0-127
if(bn < NINDIRECT){
// Load indirect block, allocating if necessary.
if((addr = ip->addrs[NDIRECT]) == 0)
ip->addrs[NDIRECT] = addr = balloc(ip->dev);
bp = bread(ip->dev, addr);
// indirect index block(0-127)
a = (uint*) bp->data;
if((addr = a[bn]) == 0){
a[bn] = addr = balloc(ip-dev);
log_write(bp);
}
brelse(bp);
return addr;
}

panic("bmap: out of range");
}

file struct

// file.h
// a file can be a pipe or an inode
// each file has a current offset
struct file {
enum {FD_NONE, FD_PIPE, FD_INODE} type;
int ref; // reference count
char readable;
char writable;
struct pipe *pipe;
struct inode *ip;
uint off;
}

file.c

// Read from file f
int
fileread(struct file *f, char *addr, int n)
{
int r;
if(f->readable == 0)
return -1;
if(f->type == FD_TYPE)
return piperead(f->pipe, addr, n);
if(f->type == FD_INODE){
ilock(f->ip);
if((r = readi(f->ip, addr, f->off, n)) > 0)
f->off += r;
iunlock(f->ip);
return r;
}
panic("fileread");
}

The last in this inode section is that I want to illustrate more about how to allocate a zero disk block for a certain device (balloc fs.c)

static uint
balloc(uint dev)
{
int b, bi, m;
struct buf *bp;

bp = 0;
// BPB is Bitmap bits per block =>BSIZE * 8
for(b=0; b< sb.size;b+=BPB) {
// Block of free map containing bit for block b
// #define BBLOCK(b, sb) (b/BPB + sb.bmapstart)
bp = bread(dev, BBLOBK(b, sb));

//Check every bit (bi) of a block
for(bi=0;bi<BPB && b+bi<sb.size;bi++){
m = 1<<(bi%8);
if((bp->data[bi/8] & m) == 0){ // Is block free?
bp->data[bi/8] |= m; // Mark block in use.
log_write(bp);
brelse(bp);
bzero(dev, b+bi);
return b+bi;
}
}
brelse(bp);
}
panic("balloc: out of blocks");
}

Directory Inodes (Directory)

Purpose: Directories are special inodes. Those inodes contain names of directories and a pointer to an unnamed inode.

  • Dirname is max of 14 chars
  • Has a special inode type T_DIR

struct dirent (fs.h)

#define DIRSIZE 14
struct dirent {
ushort inum;
char name[DIRSIZ];
};

Interfaces of Directory Inode

  • dirlookup: Search for a directory given directory name
  • dirlink: Add a new file to a directory

Now that us talk more about implementation of the interface

// Look for a directory entry in a directory.
// If found, set *poff to byte offset of entry.
struct inode*
dirlookup(struct inode *dp, char *name, uint *poff)
{
uint off, inum;
struct dirent de;
// loop inode to find the matched directory entry
for(off=0;off<dp->size;off+=sizeof(de)) {
if(readi(dp, (char *)&de, off, sizeof(de)) != sizeof(de))
panic("dirlookup read");
if(de.inum == 0)
continue;
// compare name is matched
if(namecmp(name, de.name) == 0){
if(poff)
*poff = off;
inum = de.inum;
return iget(dp->dev, inum); // get inode
}
}
return 0;
}

Recursive Lookup (Pathname)

Purpose: Hierarchical path like /usr/bin/sh that can look up recursively by ‘/’

Interface

  • namei: Resolve a path into inode

> if started with ‘/’, that means resolve from root path

> otherwise, start from current dir

namei (fs.c)

struct inode*
namei(char *path)
{
char name[DIRSIZ];
return namex(path, 0, name);
}
// Look up and return the inode for a path name.
// If parent != 0, return the inode for the parent and copy the final
// path element into name, which must have room for DIRSIZ bytes.
// Must be called inside a transaction since it calls iput().
static struct inode*
namex(char *path, int nameiparent, char *name)
{
struct inode *ip, *next;

// root path or cwd
if(*path == '/')
ip = iget(ROOTDEV, ROOTINO);
else
ip = idup(myproc()->cwd);
// skipelem("a/bb/c", name) = "bb/c", setting name = "a"
while((path = skipelem(path, name)) != 0) {
ilock(ip);
if(ip->type != T_DIR) {
iunlockput(ip);
return 0;
}
if(nameiparent && *path == '\0') {
// Stop one level early.
iunlock(ip);
return ip;
}
// dir not found
if((next=dirlookup(ip, name, 0)) == 0){
iunlockput(ip);
return 0;
}
iunlockput(ip);
ip = next;
}
if(nameiparent){
iput(ip);
return 0;
}
return ip;
}

File Descriptors (System Call)

Purpose: File descriptors are abstract resources as Files like Files, Sockets, Devices, Pipes

file descriptor illustration

struct of file

// proc.h
struct proc {
...
// Open files
struct file *ofile[NOFILE];
...
}
file descriptor points to pipe

Interface of File Descriptor

  • fdalloc : Allocate a file descriptor for a given file
// sysfile.c
static int
fdalloc(struct file *f)
{
int fd;
struct proc *curproc = mycpu();
// NOFILE is max files open for a process
for(fd=0;fd<NOFILE;fd++){
if(curproc->ofile[fd] == 0) {
curproc->ofile[fd] = f;
return fd;
}
}
return -1;
}

Reference

  1. brilliant OS course by Anton Burtsev in UC Irvine

--

--