File Systems in XV6

Brian Pan

14 min readMar 18, 2018

Main ideas for this post

File System Architecture
Crash Recovery
Cache Mechanism

File System Architecture

layout of File System

Notice: Block 0, 1, 2 are fixed

Block 0: Boot code

Block 1: Super Block. Store metadata about the file system

Size (# of the blocks)
# of data blocks
# of inodes
# of blocks in log
Besides, super block also fills in by a small program called mkfs (mkfs.c) which is an initial file system

Block 2: Log area. Use for transactions. Maintain consistency in case of a power outrage or system shutdown accidentally

Inodes: Unnamed files

Bitmap: An area to check which blocks are in use

Data area: Actual data located

Bottom up Approach

File descriptors
Recursive lookup
Directory Inodes
Inodes and Block allocator
Logging
Buffer Cache
Disk

Disk

Actual Disk

Buffer Cache (Block)

Purpose: Read & Write date from a block device

Goals:

Synchronizations :

Synchronize disk blocks and ensure only one copy of a data block exist in the kernel
Make sure only one writer(kernel process) updates the copy at a time

Caching:

Cache frequently used data in buffer for efficiency

Buffer definition (buf.h)

struct buf {
  int flags; // buf status
  uint dev; // device flag, we may have multiple devices
  uint blockno; // block no on disk
  struct sleeplock lock; // a lock for sleeplock
  struct buf *prev; // LRU cache list
  struct buf *next;
  struct buf *qnext; // disk queue
  uchar data[BSIZE]; // size of one buf is 512KB};// buf status
#define B_VALID 0x2  // buffer has been read from disk
#define B_DIRTY 0x4  // buffer needs to be written to disk

Buffer cache definition (bio.c)

Notice: bcache.head is a dummy head the bcache.head.next is the actual most recent used buf

// the buffer cache structure
struct {
  struct spinlock lock;
  struct buf buf[NBUF]; // NBUF : MAXOPBLOCKS*3 size of the disk block cache ,  MAXOPBLOCKS = 10 : max # of blocks any FS op writes  // Linked list of all buffers, through prev/next.
  // head.next is most recently used.
  struct buf head;
} bcache;

Interfaces of buffer cache

bget: find an available buffer in cache line
bread: obtain a copy of a disk block for reading
bwrite: write a modified copy into a block
brelease: only one thread holds access and the others are blocked and waited until the the current thread released (brelease)

Code Snippets

// Return a locked buf with the contents of the indicated block.
struct buf*
bread(uint dev, uint blockno)
{
  struct buf *b;
  b = bget(dev, blockno);
  // not valid
  if( (b->flags & B_VALID) == 0 ) {
    // need to read from disk
    iderw(b);
  }
  return b;
}// write b's content to disk. Must be locked !!
void
bwrite(struct buf *b)
{
  // must hold the sleeplock
  if(!holdingsleep(&b->lock))
    panic("bwrite");
  b->flags = |= B_DIRTY;
  iderw(b);
}// Look through buffer cache for block on device dev.
// If not found, allocate a buffer.
// In either case, return locked buffer.
static struct buf*
bget(uint dev, uint blockno)
{
  struct buf *b;
  
  acquire(&bcache.lock);
  
  // Is the block really cached?
  for(b = bcache.head.next; b != &bcache.head; b=b->next){
    if(b->dev == dev && b->blockno == blockno) {
      b->refcnt++; //referenced counter ++
      release(&bcache.lock);
      acquiresleep(&b.lock);
      // return as a static buf ptr
      return b;
    }
  }
  
  // Not cached; recycle an unused buffer.
  // Even if refcnt==0, B_DIRTY indicates a buffer is in use
  // because log.c has modified it but not yet committed it.
  // LRU from last to head
  for(b=bcache.head.prev; b != &bcache.head; b=b.prev) {
    if(b->refcnt==0 && (b->flags & B_DIRTY)==0) {
      b->dev = dev;
      b->blockno = blockno;
      b->flags = 0;
      b->refcnt = 1;
      release(&bcache.lock);
      acquiresleep(&b->lock);
      return b;
    }
  }
  panic("bget: no buffers");}// Release a locked buffer.
// Move to the head of the MRU list
void
brelse(stuct buf *b)
{
  if(!holdingsleep(&b->lock))
    panic("brelse");
  
  releasesleep(&b->lock);
  
  acquire(&bcache.lock);
  b->refcnt--;
  if(b->refcnt == 0) {
    // no one is waiting for it.
    // linked list remove b
    b->next->prev = b->prev;
    b->prev->next = b->next;
    
    // put to bcache.head.next
    b->next = bcache.head.next;
    b->prev = &bcache.head;    // as the head of bcache
    bcache.head.next->prev = b;
    bcache.head.next = b;
  }
  
  release(&bcache.lock);
}

Example: install_trans(void) -> commit data from log to their home destination

static void
install_trans(void)
{
  int tail;
  // log.lh comes from  read_head
  // bread(log.dev, log.start);
  // struct logheader *lh = (struct logheader *)(buf->data);  for(tail = 0; tail < log.lh.n;tail++) {
    struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
    struct buf *dbuf = bread(log.dev, log.lh.block[tail]); //read dest
    memmove(dbuf->data, lbuf->data, BSIZE);
    bwrite(dbuf); // write dst to disk
    brelse(lbuf);
    brelse(dbuf);
  }
}

Logging (Transactions)

Purpose: Do transactions to achieve crash recovery. Transactions mean group multiple writes into one bundle

Goal: For Consistency

To be more specific, file system operations involve multiple writes to disk. Once the system crashes, it may leave the inconsistent state of the file system.

Ex: During file deletion, it can leave file system with

Leave an allocated but unreferenced block
Leave an inode referenced to a content block that is marked free

Solution

Main idea: Do not write to disk directly !

Instead,

Logged data into journel
Once all writes are logged, the system writes a special commit record

Indicated that the log contains a complete transaction

3. Then, file system copies those logs to on disk location. (install_trans talked in last example)

Erase data in log after complete

Recovery Methodology

After rebooting, copy the log if the status is marked as complete
Otherwise, discard all writes in log

Interfaces of Logging

begin_op: called at the start of each FS system call
log_write: caller modified buffer as B_DIRTY and data. Then, write the buffer’s block id into log header
end_op: caller calls at the end of each FS system call. commits it if this is the last outstanding operation
commit: process from cache->log, header to disk, log->disk
write_log: write modified data from cache to log
write_head: write log header’s block num info from log header to disk
install_trans: write data from log to to blockno in disk

Code Snippet (log.c)

// used structs
struct logheader {
  int n; // # of the logged block
  int block[LOGSIZE]; # the logged block's blockno
}struct log {
  struct spinlock lock;
  int start;
  int size;
  int outstanding; // how many FS sys calls are executing.
  int committing;  // in commit(), please wait.
  int dev;
  struct logheader lh;
}struct log log;// called at the start of each FS system call.
void
begin_op(void)
{
  acquire(&log.lock);
  while(1) {
    if(log.committing) {
      sleep(&log, &log.lock);
    } else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE) {
      // this op might exhaust log space; wait for commit.
      sleep(&log, &log.lock);
    } else {
      log.outstanding += 1;
      release(&log.lock);
      break;
    }
}// Caller has modified b->data and is done with the buffer.
// Record the block number and pin in the cache with B_DIRTY.
// commit()/write_log() will do the disk write.
//
// log_write() replaces bwrite(); a typical use is:
//   bp = bread(...)
//   modify bp->data[]
//   log_write(bp)
//   brelse(bp)void
log_write(struct buf *b)
{
  int i;
  if(log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
    panic("too big a transaction");
  if(log.outstanding < 1)
    panic("log_write outside of trans");
  
  acquire(&log.lock);
  for(i=0; i<log.lh.n;i++) {
    if(log.lh.block[i] == b->blockno) //log absorbtion
      break;
  }
  log.lh.block[i] = b->blockno;
  if(i == log.lh.n)
    log.lh.n++ // increase logged block size
  b->flags |= B_DIRTY;
  release(&log.lock);
}// called at the end of each FS system call.
// commits if this was the last outstanding operation.
void 
end_op(void) {
  int do_commit = 0;
  
  acquire(&log.lock);
  log.outstanding -= 1;
  if(log.committing)
    panic("log committing");
  if(log.outstanding == 0) {
    do_commit = 1; // no anyone do transaction
    log.committing = 1;
  } else {
    / begin_op() may be waiting for log space,
    // and decrementing log.outstanding has decreased
    // the amount of reserved space.
    wakeup(&log);
  }
  release(&log.lock);
  
  if(do_commit){
    // call commit w/o holding locks, since not allowed
    // to sleep with locks.
    commit();
    acquire(&lock.lock);
    log.committing = 0;
    wakeup(&log);
    release(&log.lock);
  }
}

File Inodes (Files)

Purpose: Unnamed files represent as Inodes. Usually, inodes are consisted of several blocks

Inode

On disk: Holds metadata -> File type, size, # of links referring to it, list of blocks with data
In memory: A copy of an on-disk inode + some additional kernel information

-> Reference counter: ip->ref

> Synchronization Flag: ip->flags

Inodes stores as an array on disk

point by sb.startinode

2. Each inodes has a number representing the position on disk

3. The kernel keeps a cache of inode in memory (Synchronization issue)

struct dinode (fs.h)

Notice: addrs[NDIRECT+1] because there is one indirect entry in a inode

// On-disk inode structure
struct dinode {
  short type;           // File type
  short major;          // Major device number (T_DEV only)
  short minor;          // Minor device number (T_DEV only)
  short nlink;          // Number of links to inode in file system
  uint size;            // Size of file (bytes)
  uint addrs[NDIRECT+1];   // Data block addresses 
};

Calculate max size of inode structure

One layer Indirect inode example:

BSIZE = 512B

Direct address Number = 12

Direct data size = 12*512B

Indirect block has 512B/4B = 128 entries

Each entry point to 512B data

That is indirect data size is 128*512B

Total size is 140*512B = 71680B

Figure representing one layer indirect inode

struct in-memory copy of inode (file.h)

// in-memory copy of an inode
struct inode {
  // more info for kernel
  uint dev;           // Device number
  uint inum;          // Inode number
  int ref;            // Reference count
  struct sleeplock lock; // protects everything below here
  int valid;          // inode has been read from disk?  short type;         // copy of disk inode
  short major;
  short minor;
  short nlink;
  uint size;
  uint addrs[NDIRECT+1];
};

struct of icache(fs.c)

struct {
  struct spinlock lock;
  struct inode inode[NINODE]; // NINODE = 50 in param.h
} icache;

Inode Life Cycle

Allocation on disk

ialloc()
iput() // dealloc

Referencing

ip->ref tracks # of active pointers to an inode in memory

Typical accessing inode example in Xv6 source code

ip = iget(dev, inum);
ilock(ip);
... examine and modify ip->xxx ...
iunlock(ip);
iput(ip);

Interfaces of Inode

iget: find the inode with number inum on device dev and return an in-memory copy (not copy the data already, just do some kernel info setup)
readi: read inode
writei: write inode

// Find the inode with number inum on device dev
// and return the in-memory copy. Does not lock
// the inode and does not read it from disk.
static struct inode*
iget(uint dev, uint inum)
{
  struct inode *ip, *empty;
  
  acquire(&icache->lock);
  
  // Is the inode already cached?
  for(ip=&icache.inode[0]; i<&inoce.inode[NINODE];ip++) {
    if(ip->ref > 0 && ip->dev == dev && ip->num == inum) {
      ip->ref++;
      release(&icache->lock);
      return ip;
    }
    if(empty == 0 && ip->ref == 0)
      empty = ip; // Remember empty slot.
  }
  
  if(empty == 0)
    panic("iget: no inodes");
  
  ip = empty;
  ip->dev = dev;
  ip->inum = inum;
  ip->ref = 1;
  ip->valid = 0;
  release(&icache.lock);
  
  return ip;
}// Read data from inode.
// Caller must hold ip->lock.
int
readi(struct inode *ip, char *dst, uint off, uint n)
{
  uint tot, m;
  struct buf *bp;
  
  if(ip->type == T_DEV){
    if(ip->major < 0 || ip->major >= NDEV || devsw[ip->major].read)
      return -1;
    return devsw[ip->major].read(ip, dst, n);
  }
  
  // offset can not over inode size or overflow
  if(off > ip->size || off+n < off)
    return -1;
  // trim read size because n over the limit
  if(off+n > ip->size)
    n = ip->size - off;// for loop to read from block
  for(tot=0;tot<n;tot+=m,off+=m, dst+=m) {
    // bmap Return the disk block address of the nth block in inode ip which is the blockno
    bp = bread(ip->dev, bmap(ip, off/BSIZE));
    // get block size or if is in the last only pick up the rest
    // also should not include offset
    m = min(n-tot, BSIZE -off%BSIZE);
    memmove(dst, bp->data + off%BSIZE, m);
    brelse(bp);
  }
  return n;
}// Write data to inode.
// Caller must hold ip->lock.
int
writei(struct inode *ip, char *src, uint, off, uint n) 
{
  uint tot, m;
  struct buf *bp;
  
  if(ip->type == T_DEV){
    if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
      return -1;
    return devsw[ip->major].write(ip, src, n);
  }
  
  if(off > ip->size || off + n < off)
    return -1;
  if(off+n > MAXFILE*BSIZE)
    return -1;
  // write data to dst by BSIZE
  for(tot=0;tot<n;tot+=m,off+=m, dst+=m) {
    bp = bread(ip->dev, bmap(ip, off/BSIZE));
    m = min(n-tot, BSIZE - off%BSIZE);
    memmove(bp->data + off%BSIZE, src, m);
    // write to log header
    log_write(bp);
    brelse(bp);
  }
  // update inode size 
  if(n > 0 && off > ip->size) {
    ip->size = off;
    iupdate(ip);
  }
  return n;
}

bmap : Solve the issue how to get the blockno from inode given blockno in inode (fs.c)

static uint 
bmap(struct inode *ip, uint bn)
{
  uint addr, *a;
  struct buf *bp;
  // is direct address
  if(bn <NDIRECT) {
    if((addr = ip->addrs[bn]) == 0)
      ip->addrs[bn] = balloc(ip->dev);
    return addr
  }
  
  bn -= NDIRECT;
  // IDIRECT 0-127
  if(bn < NINDIRECT){
    // Load indirect block, allocating if necessary.
    if((addr = ip->addrs[NDIRECT]) == 0)
      ip->addrs[NDIRECT] = addr = balloc(ip->dev);
    bp = bread(ip->dev, addr);
    // indirect index block(0-127)
    a = (uint*) bp->data;
    if((addr = a[bn]) == 0){
      a[bn] = addr = balloc(ip-dev);
      log_write(bp);
    }
    brelse(bp);
    return addr;
  }
  
  panic("bmap: out of range");
}

file struct

// file.h
// a file can be a pipe or an inode
// each file has a current offset
struct file {
  enum {FD_NONE, FD_PIPE, FD_INODE} type;
  int ref; // reference count
  char readable;
  char writable;
  struct pipe *pipe;
  struct inode *ip;
  uint off;
}

file.c

// Read from file f
int
fileread(struct file *f, char *addr, int n)
{
  int r;
  if(f->readable == 0)
    return -1;
  if(f->type == FD_TYPE)
    return piperead(f->pipe, addr, n);
  if(f->type == FD_INODE){
    ilock(f->ip);
    if((r = readi(f->ip, addr, f->off, n)) > 0)
      f->off += r;
    iunlock(f->ip);
    return r;
  }
  panic("fileread");
}

The last in this inode section is that I want to illustrate more about how to allocate a zero disk block for a certain device (balloc fs.c)

static uint
balloc(uint dev)
{
  int b, bi, m;
  struct buf *bp;
  
  bp = 0;
  // BPB is Bitmap bits per block =>BSIZE * 8  for(b=0; b< sb.size;b+=BPB) {
    // Block of free map containing bit for block b
    // #define BBLOCK(b, sb) (b/BPB + sb.bmapstart)
    bp = bread(dev, BBLOBK(b, sb));
    
    //Check every bit (bi) of a block
    for(bi=0;bi<BPB && b+bi<sb.size;bi++){
      m = 1<<(bi%8);
      if((bp->data[bi/8] & m) == 0){ // Is block free?
        bp->data[bi/8] |= m; // Mark block in use.
        log_write(bp);
        brelse(bp);
        bzero(dev, b+bi);
        return b+bi;
      }
    }
    brelse(bp);
  }
  panic("balloc: out of blocks");
}

Directory Inodes (Directory)

Purpose: Directories are special inodes. Those inodes contain names of directories and a pointer to an unnamed inode.

Dirname is max of 14 chars
Has a special inode type T_DIR

struct dirent (fs.h)

#define DIRSIZE 14
struct dirent {
  ushort inum;
  char name[DIRSIZ];
};

Interfaces of Directory Inode

dirlookup: Search for a directory given directory name
dirlink: Add a new file to a directory

Now that us talk more about implementation of the interface

// Look for a directory entry in a directory.
// If found, set *poff to byte offset of entry.
struct inode*
dirlookup(struct inode *dp, char *name, uint *poff)
{
  uint off, inum;
  struct dirent de;
  // loop inode to find the matched directory entry
  for(off=0;off<dp->size;off+=sizeof(de)) {
    if(readi(dp, (char *)&de, off, sizeof(de)) != sizeof(de))
      panic("dirlookup read");
    if(de.inum == 0)
      continue;
    // compare name is matched
    if(namecmp(name, de.name) == 0){
      if(poff)
        *poff = off;
      inum = de.inum;
      return iget(dp->dev, inum); // get inode
    }
  }
  return 0;
}

Recursive Lookup (Pathname)

Purpose: Hierarchical path like /usr/bin/sh that can look up recursively by ‘/’

Interface

namei: Resolve a path into inode

> if started with ‘/’, that means resolve from root path

> otherwise, start from current dir

namei (fs.c)

struct inode*
namei(char *path)
{
  char name[DIRSIZ];
  return namex(path, 0, name);
}// Look up and return the inode for a path name.
// If parent != 0, return the inode for the parent and copy the final
// path element into name, which must have room for DIRSIZ bytes.
// Must be called inside a transaction since it calls iput().
static struct inode*
namex(char *path, int nameiparent, char *name)
{
  struct inode *ip, *next;
  
  // root path or cwd
  if(*path == '/')
    ip = iget(ROOTDEV, ROOTINO);
  else
    ip = idup(myproc()->cwd);
  // skipelem("a/bb/c", name) = "bb/c", setting name = "a"
  while((path = skipelem(path, name)) != 0) {
    ilock(ip);
    if(ip->type != T_DIR) {
      iunlockput(ip);
      return 0;
    }
    if(nameiparent && *path == '\0') {
      // Stop one level early.
      iunlock(ip);
      return ip;
    }
    // dir not found
    if((next=dirlookup(ip, name, 0)) == 0){
      iunlockput(ip);
      return 0;
    }
    iunlockput(ip);
    ip = next;
  }
  if(nameiparent){
    iput(ip);
    return 0;
  }
  return ip;
}

File Descriptors (System Call)

Purpose: File descriptors are abstract resources as Files like Files, Sockets, Devices, Pipes

struct of file

// proc.h
struct proc {
  ...
  // Open files
  struct file *ofile[NOFILE];
  ...
}

Interface of File Descriptor

fdalloc : Allocate a file descriptor for a given file

// sysfile.c
static int
fdalloc(struct file *f)
{
  int fd;
  struct proc *curproc = mycpu();
  // NOFILE is max files open for a process
  for(fd=0;fd<NOFILE;fd++){
    if(curproc->ofile[fd] == 0) {
      curproc->ofile[fd] = f;
      return fd;
    }
  }
  return -1;
}

Reference

brilliant OS course by Anton Burtsev in UC Irvine