Main ideas for this post
- File System Architecture
- Crash Recovery
- Cache Mechanism
File System Architecture
layout of File System
Notice: Block 0, 1, 2 are fixed
Block 0: Boot code
Block 1: Super Block. Store metadata about the file system
- Size (# of the blocks)
- # of data blocks
- # of inodes
- # of blocks in log
- Besides, super block also fills in by a small program called mkfs (mkfs.c) which is an initial file system
Block 2: Log area. Use for transactions. Maintain consistency in case of a power outrage or system shutdown accidentally
Inodes: Unnamed files
Bitmap: An area to check which blocks are in use
Data area: Actual data located
Bottom up Approach
- File descriptors
- Recursive lookup
- Directory Inodes
- Inodes and Block allocator
- Logging
- Buffer Cache
- Disk
Disk
Actual Disk
Buffer Cache (Block)
Purpose: Read & Write date from a block device
Goals:
- Synchronizations :
- Synchronize disk blocks and ensure only one copy of a data block exist in the kernel
- Make sure only one writer(kernel process) updates the copy at a time
- Caching:
- Cache frequently used data in buffer for efficiency
Buffer definition (buf.h)
struct buf {
int flags; // buf status
uint dev; // device flag, we may have multiple devices
uint blockno; // block no on disk
struct sleeplock lock; // a lock for sleeplock
struct buf *prev; // LRU cache list
struct buf *next;
struct buf *qnext; // disk queue
uchar data[BSIZE]; // size of one buf is 512KB};// buf status
#define B_VALID 0x2 // buffer has been read from disk
#define B_DIRTY 0x4 // buffer needs to be written to disk
Buffer cache definition (bio.c)
Notice: bcache.head is a dummy head the bcache.head.next is the actual most recent used buf
// the buffer cache structure
struct {
struct spinlock lock;
struct buf buf[NBUF]; // NBUF : MAXOPBLOCKS*3 size of the disk block cache , MAXOPBLOCKS = 10 : max # of blocks any FS op writes // Linked list of all buffers, through prev/next.
// head.next is most recently used.
struct buf head;
} bcache;
Interfaces of buffer cache
- bget: find an available buffer in cache line
- bread: obtain a copy of a disk block for reading
- bwrite: write a modified copy into a block
- brelease: only one thread holds access and the others are blocked and waited until the the current thread released (brelease)
Code Snippets
// Return a locked buf with the contents of the indicated block.
struct buf*
bread(uint dev, uint blockno)
{
struct buf *b;
b = bget(dev, blockno);
// not valid
if( (b->flags & B_VALID) == 0 ) {
// need to read from disk
iderw(b);
}
return b;
}// write b's content to disk. Must be locked !!
void
bwrite(struct buf *b)
{
// must hold the sleeplock
if(!holdingsleep(&b->lock))
panic("bwrite");
b->flags = |= B_DIRTY;
iderw(b);
}// Look through buffer cache for block on device dev.
// If not found, allocate a buffer.
// In either case, return locked buffer.
static struct buf*
bget(uint dev, uint blockno)
{
struct buf *b;
acquire(&bcache.lock);
// Is the block really cached?
for(b = bcache.head.next; b != &bcache.head; b=b->next){
if(b->dev == dev && b->blockno == blockno) {
b->refcnt++; //referenced counter ++
release(&bcache.lock);
acquiresleep(&b.lock);
// return as a static buf ptr
return b;
}
}
// Not cached; recycle an unused buffer.
// Even if refcnt==0, B_DIRTY indicates a buffer is in use
// because log.c has modified it but not yet committed it.
// LRU from last to head
for(b=bcache.head.prev; b != &bcache.head; b=b.prev) {
if(b->refcnt==0 && (b->flags & B_DIRTY)==0) {
b->dev = dev;
b->blockno = blockno;
b->flags = 0;
b->refcnt = 1;
release(&bcache.lock);
acquiresleep(&b->lock);
return b;
}
}
panic("bget: no buffers");}// Release a locked buffer.
// Move to the head of the MRU list
void
brelse(stuct buf *b)
{
if(!holdingsleep(&b->lock))
panic("brelse");
releasesleep(&b->lock);
acquire(&bcache.lock);
b->refcnt--;
if(b->refcnt == 0) {
// no one is waiting for it.
// linked list remove b
b->next->prev = b->prev;
b->prev->next = b->next;
// put to bcache.head.next
b->next = bcache.head.next;
b->prev = &bcache.head; // as the head of bcache
bcache.head.next->prev = b;
bcache.head.next = b;
}
release(&bcache.lock);
}
Example: install_trans(void) -> commit data from log to their home destination
static void
install_trans(void)
{
int tail;
// log.lh comes from read_head
// bread(log.dev, log.start);
// struct logheader *lh = (struct logheader *)(buf->data); for(tail = 0; tail < log.lh.n;tail++) {
struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
struct buf *dbuf = bread(log.dev, log.lh.block[tail]); //read dest
memmove(dbuf->data, lbuf->data, BSIZE);
bwrite(dbuf); // write dst to disk
brelse(lbuf);
brelse(dbuf);
}
}
Logging (Transactions)
Purpose: Do transactions to achieve crash recovery. Transactions mean group multiple writes into one bundle
Goal: For Consistency
To be more specific, file system operations involve multiple writes to disk. Once the system crashes, it may leave the inconsistent state of the file system.
Ex: During file deletion, it can leave file system with
- Leave an allocated but unreferenced block
- Leave an inode referenced to a content block that is marked free
Solution
Main idea: Do not write to disk directly !
Instead,
- Logged data into journel
- Once all writes are logged, the system writes a special commit record
- Indicated that the log contains a complete transaction
3. Then, file system copies those logs to on disk location. (install_trans talked in last example)
- Erase data in log after complete
Recovery Methodology
- After rebooting, copy the log if the status is marked as complete
- Otherwise, discard all writes in log
Interfaces of Logging
- begin_op: called at the start of each FS system call
- log_write: caller modified buffer as B_DIRTY and data. Then, write the buffer’s block id into log header
- end_op: caller calls at the end of each FS system call. commits it if this is the last outstanding operation
- commit: process from cache->log, header to disk, log->disk
- write_log: write modified data from cache to log
- write_head: write log header’s block num info from log header to disk
- install_trans: write data from log to to blockno in disk
Code Snippet (log.c)
// used structs
struct logheader {
int n; // # of the logged block
int block[LOGSIZE]; # the logged block's blockno
}struct log {
struct spinlock lock;
int start;
int size;
int outstanding; // how many FS sys calls are executing.
int committing; // in commit(), please wait.
int dev;
struct logheader lh;
}struct log log;// called at the start of each FS system call.
void
begin_op(void)
{
acquire(&log.lock);
while(1) {
if(log.committing) {
sleep(&log, &log.lock);
} else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE) {
// this op might exhaust log space; wait for commit.
sleep(&log, &log.lock);
} else {
log.outstanding += 1;
release(&log.lock);
break;
}
}// Caller has modified b->data and is done with the buffer.
// Record the block number and pin in the cache with B_DIRTY.
// commit()/write_log() will do the disk write.
//
// log_write() replaces bwrite(); a typical use is:
// bp = bread(...)
// modify bp->data[]
// log_write(bp)
// brelse(bp)void
log_write(struct buf *b)
{
int i;
if(log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
panic("too big a transaction");
if(log.outstanding < 1)
panic("log_write outside of trans");
acquire(&log.lock);
for(i=0; i<log.lh.n;i++) {
if(log.lh.block[i] == b->blockno) //log absorbtion
break;
}
log.lh.block[i] = b->blockno;
if(i == log.lh.n)
log.lh.n++ // increase logged block size
b->flags |= B_DIRTY;
release(&log.lock);
}// called at the end of each FS system call.
// commits if this was the last outstanding operation.
void
end_op(void) {
int do_commit = 0;
acquire(&log.lock);
log.outstanding -= 1;
if(log.committing)
panic("log committing");
if(log.outstanding == 0) {
do_commit = 1; // no anyone do transaction
log.committing = 1;
} else {
/ begin_op() may be waiting for log space,
// and decrementing log.outstanding has decreased
// the amount of reserved space.
wakeup(&log);
}
release(&log.lock);
if(do_commit){
// call commit w/o holding locks, since not allowed
// to sleep with locks.
commit();
acquire(&lock.lock);
log.committing = 0;
wakeup(&log);
release(&log.lock);
}
}
more on code snippets (Commit)
Notice: log.start+tail+1 : plus one is for log header
static void
commit()
{
if(log.lh.n > 0) {
write_log(); // Write modified blocks from cache to log
write_head(); // Write header to disk -- the real commit
install_trans(); // Now install writes to home locations
log.lh.n = 0;
write_head(); // Erase the transaction from the log
}
}// Copy modified blocks from cache to log
static void
write_log(void)
{
int tail;
// loop the log
for(tail=0;tail<log.n.h;tail++) {
struct buf *to = bread(log.dev, log.start+tail+1); // log block
struct buf *from = bread(log.dev,log.lh.block[tail]); // cache block
memove(to->data, from->data, BSIZE);
bwrite(to);
brelse(from);
brelse(to);
}
}// Write in-memory log header to disk.
// This is the true point at which the
// current transaction commits.
static void
write_head(void)
{
struct buf *buf = bread(log.dev, log.start);
struct logheader *hb = (struct logheader *) (buf->data);
int i;
hb->n = log.lh.n;
// put logheader's block into disk
for(i=0;i<hb->n;i++) {
hb->block[i] = log.lh.block[i];
} bwrite(buf);
brelse(buf);
}static void
install_trans(void)
{
int tail;
// log.lh comes from read_head
// bread(log.dev, log.start);
// struct logheader *lh = (struct logheader *)(buf->data);for(tail = 0; tail < log.lh.n;tail++) {
struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
struct buf *dbuf = bread(log.dev, log.lh.block[tail]); //read dest
memmove(dbuf->data, lbuf->data, BSIZE);
bwrite(dbuf); // write dst to disk
brelse(lbuf);
brelse(dbuf);
}
}
To be mention, the brief process of transactions is:
- log_write: writes modified blocks’ blockno into log header
2. write_log: write modified data in cache into log
3. write_head: write log header’s blockno info into disk
4. install_trans: write log data into real dest
5. update there is no block waits for transaction :
- log.lh.n = 0
- log.committing = 0
File Inodes (Files)
Purpose: Unnamed files represent as Inodes. Usually, inodes are consisted of several blocks
Inode
- On disk: Holds metadata -> File type, size, # of links referring to it, list of blocks with data
- In memory: A copy of an on-disk inode + some additional kernel information
-> Reference counter: ip->ref
- > Synchronization Flag: ip->flags
- Inodes stores as an array on disk
- point by sb.startinode
2. Each inodes has a number representing the position on disk
3. The kernel keeps a cache of inode in memory (Synchronization issue)
struct dinode (fs.h)
Notice: addrs[NDIRECT+1] because there is one indirect entry in a inode
// On-disk inode structure
struct dinode {
short type; // File type
short major; // Major device number (T_DEV only)
short minor; // Minor device number (T_DEV only)
short nlink; // Number of links to inode in file system
uint size; // Size of file (bytes)
uint addrs[NDIRECT+1]; // Data block addresses
};
Calculate max size of inode structure
One layer Indirect inode example:
BSIZE = 512B
Direct address Number = 12
Direct data size = 12*512B
Indirect block has 512B/4B = 128 entries
Each entry point to 512B data
That is indirect data size is 128*512B
Total size is 140*512B = 71680B
struct in-memory copy of inode (file.h)
// in-memory copy of an inode
struct inode {
// more info for kernel
uint dev; // Device number
uint inum; // Inode number
int ref; // Reference count
struct sleeplock lock; // protects everything below here
int valid; // inode has been read from disk? short type; // copy of disk inode
short major;
short minor;
short nlink;
uint size;
uint addrs[NDIRECT+1];
};
struct of icache(fs.c)
struct {
struct spinlock lock;
struct inode inode[NINODE]; // NINODE = 50 in param.h
} icache;
Inode Life Cycle
- Allocation on disk
- ialloc()
- iput() // dealloc
- Referencing
- ip->ref tracks # of active pointers to an inode in memory
Typical accessing inode example in Xv6 source code
ip = iget(dev, inum);
ilock(ip);
... examine and modify ip->xxx ...
iunlock(ip);
iput(ip);
Interfaces of Inode
- iget: find the inode with number inum on device dev and return an in-memory copy (not copy the data already, just do some kernel info setup)
- readi: read inode
- writei: write inode
// Find the inode with number inum on device dev
// and return the in-memory copy. Does not lock
// the inode and does not read it from disk.
static struct inode*
iget(uint dev, uint inum)
{
struct inode *ip, *empty;
acquire(&icache->lock);
// Is the inode already cached?
for(ip=&icache.inode[0]; i<&inoce.inode[NINODE];ip++) {
if(ip->ref > 0 && ip->dev == dev && ip->num == inum) {
ip->ref++;
release(&icache->lock);
return ip;
}
if(empty == 0 && ip->ref == 0)
empty = ip; // Remember empty slot.
}
if(empty == 0)
panic("iget: no inodes");
ip = empty;
ip->dev = dev;
ip->inum = inum;
ip->ref = 1;
ip->valid = 0;
release(&icache.lock);
return ip;
}// Read data from inode.
// Caller must hold ip->lock.
int
readi(struct inode *ip, char *dst, uint off, uint n)
{
uint tot, m;
struct buf *bp;
if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || devsw[ip->major].read)
return -1;
return devsw[ip->major].read(ip, dst, n);
}
// offset can not over inode size or overflow
if(off > ip->size || off+n < off)
return -1;
// trim read size because n over the limit
if(off+n > ip->size)
n = ip->size - off;// for loop to read from block
for(tot=0;tot<n;tot+=m,off+=m, dst+=m) {
// bmap Return the disk block address of the nth block in inode ip which is the blockno
bp = bread(ip->dev, bmap(ip, off/BSIZE));
// get block size or if is in the last only pick up the rest
// also should not include offset
m = min(n-tot, BSIZE -off%BSIZE);
memmove(dst, bp->data + off%BSIZE, m);
brelse(bp);
}
return n;
}// Write data to inode.
// Caller must hold ip->lock.
int
writei(struct inode *ip, char *src, uint, off, uint n)
{
uint tot, m;
struct buf *bp;
if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
return -1;
return devsw[ip->major].write(ip, src, n);
}
if(off > ip->size || off + n < off)
return -1;
if(off+n > MAXFILE*BSIZE)
return -1;
// write data to dst by BSIZE
for(tot=0;tot<n;tot+=m,off+=m, dst+=m) {
bp = bread(ip->dev, bmap(ip, off/BSIZE));
m = min(n-tot, BSIZE - off%BSIZE);
memmove(bp->data + off%BSIZE, src, m);
// write to log header
log_write(bp);
brelse(bp);
}
// update inode size
if(n > 0 && off > ip->size) {
ip->size = off;
iupdate(ip);
}
return n;
}
bmap : Solve the issue how to get the blockno from inode given blockno in inode (fs.c)
static uint
bmap(struct inode *ip, uint bn)
{
uint addr, *a;
struct buf *bp;
// is direct address
if(bn <NDIRECT) {
if((addr = ip->addrs[bn]) == 0)
ip->addrs[bn] = balloc(ip->dev);
return addr
}
bn -= NDIRECT;
// IDIRECT 0-127
if(bn < NINDIRECT){
// Load indirect block, allocating if necessary.
if((addr = ip->addrs[NDIRECT]) == 0)
ip->addrs[NDIRECT] = addr = balloc(ip->dev);
bp = bread(ip->dev, addr);
// indirect index block(0-127)
a = (uint*) bp->data;
if((addr = a[bn]) == 0){
a[bn] = addr = balloc(ip-dev);
log_write(bp);
}
brelse(bp);
return addr;
}
panic("bmap: out of range");
}
file struct
// file.h
// a file can be a pipe or an inode
// each file has a current offset
struct file {
enum {FD_NONE, FD_PIPE, FD_INODE} type;
int ref; // reference count
char readable;
char writable;
struct pipe *pipe;
struct inode *ip;
uint off;
}
file.c
// Read from file f
int
fileread(struct file *f, char *addr, int n)
{
int r;
if(f->readable == 0)
return -1;
if(f->type == FD_TYPE)
return piperead(f->pipe, addr, n);
if(f->type == FD_INODE){
ilock(f->ip);
if((r = readi(f->ip, addr, f->off, n)) > 0)
f->off += r;
iunlock(f->ip);
return r;
}
panic("fileread");
}
The last in this inode section is that I want to illustrate more about how to allocate a zero disk block for a certain device (balloc fs.c)
static uint
balloc(uint dev)
{
int b, bi, m;
struct buf *bp;
bp = 0;
// BPB is Bitmap bits per block =>BSIZE * 8 for(b=0; b< sb.size;b+=BPB) {
// Block of free map containing bit for block b
// #define BBLOCK(b, sb) (b/BPB + sb.bmapstart)
bp = bread(dev, BBLOBK(b, sb));
//Check every bit (bi) of a block
for(bi=0;bi<BPB && b+bi<sb.size;bi++){
m = 1<<(bi%8);
if((bp->data[bi/8] & m) == 0){ // Is block free?
bp->data[bi/8] |= m; // Mark block in use.
log_write(bp);
brelse(bp);
bzero(dev, b+bi);
return b+bi;
}
}
brelse(bp);
}
panic("balloc: out of blocks");
}
Directory Inodes (Directory)
Purpose: Directories are special inodes. Those inodes contain names of directories and a pointer to an unnamed inode.
- Dirname is max of 14 chars
- Has a special inode type T_DIR
struct dirent (fs.h)
#define DIRSIZE 14
struct dirent {
ushort inum;
char name[DIRSIZ];
};
Interfaces of Directory Inode
- dirlookup: Search for a directory given directory name
- dirlink: Add a new file to a directory
Now that us talk more about implementation of the interface
// Look for a directory entry in a directory.
// If found, set *poff to byte offset of entry.
struct inode*
dirlookup(struct inode *dp, char *name, uint *poff)
{
uint off, inum;
struct dirent de;
// loop inode to find the matched directory entry
for(off=0;off<dp->size;off+=sizeof(de)) {
if(readi(dp, (char *)&de, off, sizeof(de)) != sizeof(de))
panic("dirlookup read");
if(de.inum == 0)
continue;
// compare name is matched
if(namecmp(name, de.name) == 0){
if(poff)
*poff = off;
inum = de.inum;
return iget(dp->dev, inum); // get inode
}
}
return 0;
}
Recursive Lookup (Pathname)
Purpose: Hierarchical path like /usr/bin/sh that can look up recursively by ‘/’
Interface
- namei: Resolve a path into inode
> if started with ‘/’, that means resolve from root path
> otherwise, start from current dir
namei (fs.c)
struct inode*
namei(char *path)
{
char name[DIRSIZ];
return namex(path, 0, name);
}// Look up and return the inode for a path name.
// If parent != 0, return the inode for the parent and copy the final
// path element into name, which must have room for DIRSIZ bytes.
// Must be called inside a transaction since it calls iput().
static struct inode*
namex(char *path, int nameiparent, char *name)
{
struct inode *ip, *next;
// root path or cwd
if(*path == '/')
ip = iget(ROOTDEV, ROOTINO);
else
ip = idup(myproc()->cwd);
// skipelem("a/bb/c", name) = "bb/c", setting name = "a"
while((path = skipelem(path, name)) != 0) {
ilock(ip);
if(ip->type != T_DIR) {
iunlockput(ip);
return 0;
}
if(nameiparent && *path == '\0') {
// Stop one level early.
iunlock(ip);
return ip;
}
// dir not found
if((next=dirlookup(ip, name, 0)) == 0){
iunlockput(ip);
return 0;
}
iunlockput(ip);
ip = next;
}
if(nameiparent){
iput(ip);
return 0;
}
return ip;
}
File Descriptors (System Call)
Purpose: File descriptors are abstract resources as Files like Files, Sockets, Devices, Pipes
struct of file
// proc.h
struct proc {
...
// Open files
struct file *ofile[NOFILE];
...
}
Interface of File Descriptor
- fdalloc : Allocate a file descriptor for a given file
// sysfile.c
static int
fdalloc(struct file *f)
{
int fd;
struct proc *curproc = mycpu();
// NOFILE is max files open for a process
for(fd=0;fd<NOFILE;fd++){
if(curproc->ofile[fd] == 0) {
curproc->ofile[fd] = f;
return fd;
}
}
return -1;
}
Reference
- brilliant OS course by Anton Burtsev in UC Irvine