1// File system implementation. Five layers:
2// + Blocks: allocator for raw disk blocks.
3// + Log: crash recovery for multi-step updates.
4// + Files: inode allocator, reading, writing, metadata.
5// + Directories: inode with special contents (list of other inodes!)
6// + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
7//
8// This file contains the low-level file system manipulation
9// routines. The (higher-level) system call implementations
10// are in sysfile.c.
11
12#include "types.h"
13#include "defs.h"
14#include "param.h"
15#include "stat.h"
16#include "mmu.h"
17#include "proc.h"
18#include "spinlock.h"
19#include "sleeplock.h"
20#include "fs.h"
21#include "buf.h"
22#include "file.h"
23
24#define min(a, b) ((a) < (b) ? (a) : (b))
25static void itrunc(struct inode*);
26// there should be one superblock per disk device, but we run with
27// only one device
28struct superblock sb;
29
30// Read the super block.
31void
32readsb(int dev, struct superblock *sb)
33{
34 struct buf *bp;
35
36 bp = bread(dev, 1);
37 memmove(sb, bp->data, sizeof(*sb));
38 brelse(bp);
39}
40
41// Zero a block.
42static void
43bzero(int dev, int bno)
44{
45 struct buf *bp;
46
47 bp = bread(dev, bno);
48 memset(bp->data, 0, BSIZE);
49 log_write(bp);
50 brelse(bp);
51}
52
53// Blocks.
54
55// Allocate a zeroed disk block.
56static uint
57balloc(uint dev)
58{
59 int b, bi, m;
60 struct buf *bp;
61
62 bp = 0;
63 for(b = 0; b < sb.size; b += BPB){
64 bp = bread(dev, BBLOCK(b, sb));
65 for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
66 m = 1 << (bi % 8);
67 if((bp->data[bi/8] & m) == 0){ // Is block free?
68 bp->data[bi/8] |= m; // Mark block in use.
69 log_write(bp);
70 brelse(bp);
71 bzero(dev, b + bi);
72 return b + bi;
73 }
74 }
75 brelse(bp);
76 }
77 panic("balloc: out of blocks");
78}
79
80// Free a disk block.
81static void
82bfree(int dev, uint b)
83{
84 struct buf *bp;
85 int bi, m;
86
87 readsb(dev, &sb);
88 bp = bread(dev, BBLOCK(b, sb));
89 bi = b % BPB;
90 m = 1 << (bi % 8);
91 if((bp->data[bi/8] & m) == 0)
92 panic("freeing free block");
93 bp->data[bi/8] &= ~m;
94 log_write(bp);
95 brelse(bp);
96}
97
98// Inodes.
99//
100// An inode describes a single unnamed file.
101// The inode disk structure holds metadata: the file's type,
102// its size, the number of links referring to it, and the
103// list of blocks holding the file's content.
104//
105// The inodes are laid out sequentially on disk at
106// sb.startinode. Each inode has a number, indicating its
107// position on the disk.
108//
109// The kernel keeps a cache of in-use inodes in memory
110// to provide a place for synchronizing access
111// to inodes used by multiple processes. The cached
112// inodes include book-keeping information that is
113// not stored on disk: ip->ref and ip->valid.
114//
115// An inode and its in-memory representation go through a
116// sequence of states before they can be used by the
117// rest of the file system code.
118//
119// * Allocation: an inode is allocated if its type (on disk)
120// is non-zero. ialloc() allocates, and iput() frees if
121// the reference and link counts have fallen to zero.
122//
123// * Referencing in cache: an entry in the inode cache
124// is free if ip->ref is zero. Otherwise ip->ref tracks
125// the number of in-memory pointers to the entry (open
126// files and current directories). iget() finds or
127// creates a cache entry and increments its ref; iput()
128// decrements ref.
129//
130// * Valid: the information (type, size, &c) in an inode
131// cache entry is only correct when ip->valid is 1.
132// ilock() reads the inode from
133// the disk and sets ip->valid, while iput() clears
134// ip->valid if ip->ref has fallen to zero.
135//
136// * Locked: file system code may only examine and modify
137// the information in an inode and its content if it
138// has first locked the inode.
139//
140// Thus a typical sequence is:
141// ip = iget(dev, inum)
142// ilock(ip)
143// ... examine and modify ip->xxx ...
144// iunlock(ip)
145// iput(ip)
146//
147// ilock() is separate from iget() so that system calls can
148// get a long-term reference to an inode (as for an open file)
149// and only lock it for short periods (e.g., in read()).
150// The separation also helps avoid deadlock and races during
151// pathname lookup. iget() increments ip->ref so that the inode
152// stays cached and pointers to it remain valid.
153//
154// Many internal file system functions expect the caller to
155// have locked the inodes involved; this lets callers create
156// multi-step atomic operations.
157//
158// The icache.lock spin-lock protects the allocation of icache
159// entries. Since ip->ref indicates whether an entry is free,
160// and ip->dev and ip->inum indicate which i-node an entry
161// holds, one must hold icache.lock while using any of those fields.
162//
163// An ip->lock sleep-lock protects all ip-> fields other than ref,
164// dev, and inum. One must hold ip->lock in order to
165// read or write that inode's ip->valid, ip->size, ip->type, &c.
166
167struct {
168 struct spinlock lock;
169 struct inode inode[NINODE];
170} icache;
171
172void
173iinit(int dev)
174{
175 int i = 0;
176
177 initlock(&icache.lock, "icache");
178 for(i = 0; i < NINODE; i++) {
179 initsleeplock(&icache.inode[i].lock, "inode");
180 }
181
182 readsb(dev, &sb);
183 cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\
184 inodestart %d bmap start %d\n", sb.size, sb.nblocks,
185 sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
186 sb.bmapstart);
187}
188
189static struct inode* iget(uint dev, uint inum);
190
191//PAGEBREAK!
192// Allocate an inode on device dev.
193// Mark it as allocated by giving it type type.
194// Returns an unlocked but allocated and referenced inode.
195struct inode*
196ialloc(uint dev, short type)
197{
198 int inum;
199 struct buf *bp;
200 struct dinode *dip;
201
202 for(inum = 1; inum < sb.ninodes; inum++){
203 bp = bread(dev, IBLOCK(inum, sb));
204 dip = (struct dinode*)bp->data + inum%IPB;
205 if(dip->type == 0){ // a free inode
206 memset(dip, 0, sizeof(*dip));
207 dip->type = type;
208 log_write(bp); // mark it allocated on the disk
209 brelse(bp);
210 return iget(dev, inum);
211 }
212 brelse(bp);
213 }
214 panic("ialloc: no inodes");
215}
216
217// Copy a modified in-memory inode to disk.
218// Must be called after every change to an ip->xxx field
219// that lives on disk, since i-node cache is write-through.
220// Caller must hold ip->lock.
221void
222iupdate(struct inode *ip)
223{
224 struct buf *bp;
225 struct dinode *dip;
226
227 bp = bread(ip->dev, IBLOCK(ip->inum, sb));
228 dip = (struct dinode*)bp->data + ip->inum%IPB;
229 dip->type = ip->type;
230 dip->major = ip->major;
231 dip->minor = ip->minor;
232 dip->nlink = ip->nlink;
233 dip->size = ip->size;
234 memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
235 log_write(bp);
236 brelse(bp);
237}
238
239// Find the inode with number inum on device dev
240// and return the in-memory copy. Does not lock
241// the inode and does not read it from disk.
242static struct inode*
243iget(uint dev, uint inum)
244{
245 struct inode *ip, *empty;
246
247 acquire(&icache.lock);
248
249 // Is the inode already cached?
250 empty = 0;
251 for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
252 if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
253 ip->ref++;
254 release(&icache.lock);
255 return ip;
256 }
257 if(empty == 0 && ip->ref == 0) // Remember empty slot.
258 empty = ip;
259 }
260
261 // Recycle an inode cache entry.
262 if(empty == 0)
263 panic("iget: no inodes");
264
265 ip = empty;
266 ip->dev = dev;
267 ip->inum = inum;
268 ip->ref = 1;
269 ip->valid = 0;
270 release(&icache.lock);
271
272 return ip;
273}
274
275// Increment reference count for ip.
276// Returns ip to enable ip = idup(ip1) idiom.
277struct inode*
278idup(struct inode *ip)
279{
280 acquire(&icache.lock);
281 ip->ref++;
282 release(&icache.lock);
283 return ip;
284}
285
286// Lock the given inode.
287// Reads the inode from disk if necessary.
288void
289ilock(struct inode *ip)
290{
291 struct buf *bp;
292 struct dinode *dip;
293
294 if(ip == 0 || ip->ref < 1)
295 panic("ilock");
296
297 acquiresleep(&ip->lock);
298
299 if(ip->valid == 0){
300 bp = bread(ip->dev, IBLOCK(ip->inum, sb));
301 dip = (struct dinode*)bp->data + ip->inum%IPB;
302 ip->type = dip->type;
303 ip->major = dip->major;
304 ip->minor = dip->minor;
305 ip->nlink = dip->nlink;
306 ip->size = dip->size;
307 memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
308 brelse(bp);
309 ip->valid = 1;
310 if(ip->type == 0)
311 panic("ilock: no type");
312 }
313}
314
315// Unlock the given inode.
316void
317iunlock(struct inode *ip)
318{
319 if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)
320 panic("iunlock");
321
322 releasesleep(&ip->lock);
323}
324
325// Drop a reference to an in-memory inode.
326// If that was the last reference, the inode cache entry can
327// be recycled.
328// If that was the last reference and the inode has no links
329// to it, free the inode (and its content) on disk.
330// All calls to iput() must be inside a transaction in
331// case it has to free the inode.
332void
333iput(struct inode *ip)
334{
335 acquiresleep(&ip->lock);
336 if(ip->valid && ip->nlink == 0){
337 acquire(&icache.lock);
338 int r = ip->ref;
339 release(&icache.lock);
340 if(r == 1){
341 // inode has no links and no other references: truncate and free.
342 itrunc(ip);
343 ip->type = 0;
344 iupdate(ip);
345 ip->valid = 0;
346 }
347 }
348 releasesleep(&ip->lock);
349
350 acquire(&icache.lock);
351 ip->ref--;
352 release(&icache.lock);
353}
354
355// Common idiom: unlock, then put.
356void
357iunlockput(struct inode *ip)
358{
359 iunlock(ip);
360 iput(ip);
361}
362
363//PAGEBREAK!
364// Inode content
365//
366// The content (data) associated with each inode is stored
367// in blocks on the disk. The first NDIRECT block numbers
368// are listed in ip->addrs[]. The next NINDIRECT blocks are
369// listed in block ip->addrs[NDIRECT].
370
371// Return the disk block address of the nth block in inode ip.
372// If there is no such block, bmap allocates one.
373static uint
374bmap(struct inode *ip, uint bn)
375{
376 uint addr, *a;
377 struct buf *bp;
378
379 if(bn < NDIRECT){
380 if((addr = ip->addrs[bn]) == 0)
381 ip->addrs[bn] = addr = balloc(ip->dev);
382 return addr;
383 }
384 bn -= NDIRECT;
385
386 if(bn < NINDIRECT){
387 // Load indirect block, allocating if necessary.
388 if((addr = ip->addrs[NDIRECT]) == 0)
389 ip->addrs[NDIRECT] = addr = balloc(ip->dev);
390 bp = bread(ip->dev, addr);
391 a = (uint*)bp->data;
392 if((addr = a[bn]) == 0){
393 a[bn] = addr = balloc(ip->dev);
394 log_write(bp);
395 }
396 brelse(bp);
397 return addr;
398 }
399
400 panic("bmap: out of range");
401}
402
403// Truncate inode (discard contents).
404// Only called when the inode has no links
405// to it (no directory entries referring to it)
406// and has no in-memory reference to it (is
407// not an open file or current directory).
408static void
409itrunc(struct inode *ip)
410{
411 int i, j;
412 struct buf *bp;
413 uint *a;
414
415 for(i = 0; i < NDIRECT; i++){
416 if(ip->addrs[i]){
417 bfree(ip->dev, ip->addrs[i]);
418 ip->addrs[i] = 0;
419 }
420 }
421
422 if(ip->addrs[NDIRECT]){
423 bp = bread(ip->dev, ip->addrs[NDIRECT]);
424 a = (uint*)bp->data;
425 for(j = 0; j < NINDIRECT; j++){
426 if(a[j])
427 bfree(ip->dev, a[j]);
428 }
429 brelse(bp);
430 bfree(ip->dev, ip->addrs[NDIRECT]);
431 ip->addrs[NDIRECT] = 0;
432 }
433
434 ip->size = 0;
435 iupdate(ip);
436}
437
438// Copy stat information from inode.
439// Caller must hold ip->lock.
440void
441stati(struct inode *ip, struct stat *st)
442{
443 st->dev = ip->dev;
444 st->ino = ip->inum;
445 st->type = ip->type;
446 st->nlink = ip->nlink;
447 st->size = ip->size;
448}
449
450//PAGEBREAK!
451// Read data from inode.
452// Caller must hold ip->lock.
453int
454readi(struct inode *ip, char *dst, uint off, uint n)
455{
456 uint tot, m;
457 struct buf *bp;
458
459 if(ip->type == T_DEV){
460 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
461 return -1;
462 return devsw[ip->major].read(ip, dst, n);
463 }
464
465 if(off > ip->size || off + n < off)
466 return -1;
467 if(off + n > ip->size)
468 n = ip->size - off;
469
470 for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
471 bp = bread(ip->dev, bmap(ip, off/BSIZE));
472 m = min(n - tot, BSIZE - off%BSIZE);
473 memmove(dst, bp->data + off%BSIZE, m);
474 brelse(bp);
475 }
476 return n;
477}
478
479// PAGEBREAK!
480// Write data to inode.
481// Caller must hold ip->lock.
482int
483writei(struct inode *ip, char *src, uint off, uint n)
484{
485 uint tot, m;
486 struct buf *bp;
487
488 if(ip->type == T_DEV){
489 if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
490 return -1;
491 return devsw[ip->major].write(ip, src, n);
492 }
493
494 if(off > ip->size || off + n < off)
495 return -1;
496 if(off + n > MAXFILE*BSIZE)
497 return -1;
498
499 for(tot=0; tot<n; tot+=m, off+=m, src+=m){
500 bp = bread(ip->dev, bmap(ip, off/BSIZE));
501 m = min(n - tot, BSIZE - off%BSIZE);
502 memmove(bp->data + off%BSIZE, src, m);
503 log_write(bp);
504 brelse(bp);
505 }
506
507 if(n > 0 && off > ip->size){
508 ip->size = off;
509 iupdate(ip);
510 }
511 return n;
512}
513
514//PAGEBREAK!
515// Directories
516
517int
518namecmp(const char *s, const char *t)
519{
520 return strncmp(s, t, DIRSIZ);
521}
522
523// Look for a directory entry in a directory.
524// If found, set *poff to byte offset of entry.
525struct inode*
526dirlookup(struct inode *dp, char *name, uint *poff)
527{
528 uint off, inum;
529 struct dirent de;
530
531 if(dp->type != T_DIR)
532 panic("dirlookup not DIR");
533
534 for(off = 0; off < dp->size; off += sizeof(de)){
535 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
536 panic("dirlookup read");
537 if(de.inum == 0)
538 continue;
539 if(namecmp(name, de.name) == 0){
540 // entry matches path element
541 if(poff)
542 *poff = off;
543 inum = de.inum;
544 return iget(dp->dev, inum);
545 }
546 }
547
548 return 0;
549}
550
551// Write a new directory entry (name, inum) into the directory dp.
552int
553dirlink(struct inode *dp, char *name, uint inum)
554{
555 int off;
556 struct dirent de;
557 struct inode *ip;
558
559 // Check that name is not present.
560 if((ip = dirlookup(dp, name, 0)) != 0){
561 iput(ip);
562 return -1;
563 }
564
565 // Look for an empty dirent.
566 for(off = 0; off < dp->size; off += sizeof(de)){
567 if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
568 panic("dirlink read");
569 if(de.inum == 0)
570 break;
571 }
572
573 strncpy(de.name, name, DIRSIZ);
574 de.inum = inum;
575 if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
576 panic("dirlink");
577
578 return 0;
579}
580
581//PAGEBREAK!
582// Paths
583
584// Copy the next path element from path into name.
585// Return a pointer to the element following the copied one.
586// The returned path has no leading slashes,
587// so the caller can check *path=='\0' to see if the name is the last one.
588// If no name to remove, return 0.
589//
590// Examples:
591// skipelem("a/bb/c", name) = "bb/c", setting name = "a"
592// skipelem("///a//bb", name) = "bb", setting name = "a"
593// skipelem("a", name) = "", setting name = "a"
594// skipelem("", name) = skipelem("////", name) = 0
595//
596static char*
597skipelem(char *path, char *name)
598{
599 char *s;
600 int len;
601
602 while(*path == '/')
603 path++;
604 if(*path == 0)
605 return 0;
606 s = path;
607 while(*path != '/' && *path != 0)
608 path++;
609 len = path - s;
610 if(len >= DIRSIZ)
611 memmove(name, s, DIRSIZ);
612 else {
613 memmove(name, s, len);
614 name[len] = 0;
615 }
616 while(*path == '/')
617 path++;
618 return path;
619}
620
621// Look up and return the inode for a path name.
622// If parent != 0, return the inode for the parent and copy the final
623// path element into name, which must have room for DIRSIZ bytes.
624// Must be called inside a transaction since it calls iput().
625static struct inode*
626namex(char *path, int nameiparent, char *name)
627{
628 struct inode *ip, *next;
629
630 if(*path == '/')
631 ip = iget(ROOTDEV, ROOTINO);
632 else
633 ip = idup(myproc()->cwd);
634
635 while((path = skipelem(path, name)) != 0){
636 ilock(ip);
637 if(ip->type != T_DIR){
638 iunlockput(ip);
639 return 0;
640 }
641 if(nameiparent && *path == '\0'){
642 // Stop one level early.
643 iunlock(ip);
644 return ip;
645 }
646 if((next = dirlookup(ip, name, 0)) == 0){
647 iunlockput(ip);
648 return 0;
649 }
650 iunlockput(ip);
651 ip = next;
652 }
653 if(nameiparent){
654 iput(ip);
655 return 0;
656 }
657 return ip;
658}
659
660struct inode*
661namei(char *path)
662{
663 char name[DIRSIZ];
664 return namex(path, 0, name);
665}
666
667struct inode*
668nameiparent(char *path, char *name)
669{
670 return namex(path, 1, name);
671}
672