1 | // File system implementation. Five layers: |
2 | // + Blocks: allocator for raw disk blocks. |
3 | // + Log: crash recovery for multi-step updates. |
4 | // + Files: inode allocator, reading, writing, metadata. |
5 | // + Directories: inode with special contents (list of other inodes!) |
6 | // + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. |
7 | // |
8 | // This file contains the low-level file system manipulation |
9 | // routines. The (higher-level) system call implementations |
10 | // are in sysfile.c. |
11 | |
12 | #include "types.h" |
13 | #include "defs.h" |
14 | #include "param.h" |
15 | #include "stat.h" |
16 | #include "mmu.h" |
17 | #include "proc.h" |
18 | #include "spinlock.h" |
19 | #include "sleeplock.h" |
20 | #include "fs.h" |
21 | #include "buf.h" |
22 | #include "file.h" |
23 | |
// Smaller of a and b. This is a function-like macro, so each argument
// may be evaluated twice: do not pass expressions with side effects.
#define min(a, b) ((a) < (b) ? (a) : (b))
// Forward declaration: itrunc() is defined further down in this file
// but is called from iput() before that definition.
static void itrunc(struct inode*);
// there should be one superblock per disk device, but we run with
// only one device
struct superblock sb;
29 | |
30 | // Read the super block. |
31 | void |
32 | readsb(int dev, struct superblock *sb) |
33 | { |
34 | struct buf *bp; |
35 | |
36 | bp = bread(dev, 1); |
37 | memmove(sb, bp->data, sizeof(*sb)); |
38 | brelse(bp); |
39 | } |
40 | |
41 | // Zero a block. |
42 | static void |
43 | bzero(int dev, int bno) |
44 | { |
45 | struct buf *bp; |
46 | |
47 | bp = bread(dev, bno); |
48 | memset(bp->data, 0, BSIZE); |
49 | log_write(bp); |
50 | brelse(bp); |
51 | } |
52 | |
53 | // Blocks. |
54 | |
55 | // Allocate a zeroed disk block. |
56 | static uint |
57 | balloc(uint dev) |
58 | { |
59 | int b, bi, m; |
60 | struct buf *bp; |
61 | |
62 | bp = 0; |
63 | for(b = 0; b < sb.size; b += BPB){ |
64 | bp = bread(dev, BBLOCK(b, sb)); |
65 | for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ |
66 | m = 1 << (bi % 8); |
67 | if((bp->data[bi/8] & m) == 0){ // Is block free? |
68 | bp->data[bi/8] |= m; // Mark block in use. |
69 | log_write(bp); |
70 | brelse(bp); |
71 | bzero(dev, b + bi); |
72 | return b + bi; |
73 | } |
74 | } |
75 | brelse(bp); |
76 | } |
77 | panic("balloc: out of blocks" ); |
78 | } |
79 | |
80 | // Free a disk block. |
81 | static void |
82 | bfree(int dev, uint b) |
83 | { |
84 | struct buf *bp; |
85 | int bi, m; |
86 | |
87 | readsb(dev, &sb); |
88 | bp = bread(dev, BBLOCK(b, sb)); |
89 | bi = b % BPB; |
90 | m = 1 << (bi % 8); |
91 | if((bp->data[bi/8] & m) == 0) |
92 | panic("freeing free block" ); |
93 | bp->data[bi/8] &= ~m; |
94 | log_write(bp); |
95 | brelse(bp); |
96 | } |
97 | |
98 | // Inodes. |
99 | // |
100 | // An inode describes a single unnamed file. |
101 | // The inode disk structure holds metadata: the file's type, |
102 | // its size, the number of links referring to it, and the |
103 | // list of blocks holding the file's content. |
104 | // |
// The inodes are laid out sequentially on disk at
// sb.inodestart. Each inode has a number, indicating its
// position on the disk.
108 | // |
109 | // The kernel keeps a cache of in-use inodes in memory |
110 | // to provide a place for synchronizing access |
111 | // to inodes used by multiple processes. The cached |
112 | // inodes include book-keeping information that is |
113 | // not stored on disk: ip->ref and ip->valid. |
114 | // |
115 | // An inode and its in-memory representation go through a |
116 | // sequence of states before they can be used by the |
117 | // rest of the file system code. |
118 | // |
119 | // * Allocation: an inode is allocated if its type (on disk) |
120 | // is non-zero. ialloc() allocates, and iput() frees if |
121 | // the reference and link counts have fallen to zero. |
122 | // |
123 | // * Referencing in cache: an entry in the inode cache |
124 | // is free if ip->ref is zero. Otherwise ip->ref tracks |
125 | // the number of in-memory pointers to the entry (open |
126 | // files and current directories). iget() finds or |
127 | // creates a cache entry and increments its ref; iput() |
128 | // decrements ref. |
129 | // |
// * Valid: the information (type, size, &c) in an inode
//   cache entry is only correct when ip->valid is 1.
//   ilock() reads the inode from the disk and sets ip->valid.
//   iget() clears ip->valid when it recycles a cache entry, and
//   iput() clears it when it frees the inode on disk.
135 | // |
136 | // * Locked: file system code may only examine and modify |
137 | // the information in an inode and its content if it |
138 | // has first locked the inode. |
139 | // |
140 | // Thus a typical sequence is: |
141 | // ip = iget(dev, inum) |
142 | // ilock(ip) |
143 | // ... examine and modify ip->xxx ... |
144 | // iunlock(ip) |
145 | // iput(ip) |
146 | // |
147 | // ilock() is separate from iget() so that system calls can |
148 | // get a long-term reference to an inode (as for an open file) |
149 | // and only lock it for short periods (e.g., in read()). |
150 | // The separation also helps avoid deadlock and races during |
151 | // pathname lookup. iget() increments ip->ref so that the inode |
152 | // stays cached and pointers to it remain valid. |
153 | // |
154 | // Many internal file system functions expect the caller to |
155 | // have locked the inodes involved; this lets callers create |
156 | // multi-step atomic operations. |
157 | // |
158 | // The icache.lock spin-lock protects the allocation of icache |
159 | // entries. Since ip->ref indicates whether an entry is free, |
160 | // and ip->dev and ip->inum indicate which i-node an entry |
161 | // holds, one must hold icache.lock while using any of those fields. |
162 | // |
163 | // An ip->lock sleep-lock protects all ip-> fields other than ref, |
164 | // dev, and inum. One must hold ip->lock in order to |
165 | // read or write that inode's ip->valid, ip->size, ip->type, &c. |
166 | |
// In-memory inode cache: a fixed table of NINODE entries.
// iget(), idup() and iput() hold icache.lock while they read or change
// an entry's ref; iget() also holds it while scanning and claiming entries.
struct {
  struct spinlock lock;
  struct inode inode[NINODE];
} icache;
171 | |
// Initialize the inode cache and load the superblock of device dev.
// Sets up icache.lock and the sleep-lock of every cache entry, reads the
// superblock into the global sb, and prints its layout fields.
void
iinit(int dev)
{
  int i = 0;

  initlock(&icache.lock, "icache");
  for(i = 0; i < NINODE; i++) {
    initsleeplock(&icache.inode[i].lock, "inode");
  }

  readsb(dev, &sb);
  // The format string is continued onto the next source line with a
  // backslash-newline inside the literal.
  cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d\
 inodestart %d bmap start %d\n", sb.size, sb.nblocks,
          sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
          sb.bmapstart);
}
188 | |
189 | static struct inode* iget(uint dev, uint inum); |
190 | |
191 | //PAGEBREAK! |
192 | // Allocate an inode on device dev. |
193 | // Mark it as allocated by giving it type type. |
194 | // Returns an unlocked but allocated and referenced inode. |
195 | struct inode* |
196 | ialloc(uint dev, short type) |
197 | { |
198 | int inum; |
199 | struct buf *bp; |
200 | struct dinode *dip; |
201 | |
202 | for(inum = 1; inum < sb.ninodes; inum++){ |
203 | bp = bread(dev, IBLOCK(inum, sb)); |
204 | dip = (struct dinode*)bp->data + inum%IPB; |
205 | if(dip->type == 0){ // a free inode |
206 | memset(dip, 0, sizeof(*dip)); |
207 | dip->type = type; |
208 | log_write(bp); // mark it allocated on the disk |
209 | brelse(bp); |
210 | return iget(dev, inum); |
211 | } |
212 | brelse(bp); |
213 | } |
214 | panic("ialloc: no inodes" ); |
215 | } |
216 | |
217 | // Copy a modified in-memory inode to disk. |
218 | // Must be called after every change to an ip->xxx field |
219 | // that lives on disk, since i-node cache is write-through. |
220 | // Caller must hold ip->lock. |
221 | void |
222 | iupdate(struct inode *ip) |
223 | { |
224 | struct buf *bp; |
225 | struct dinode *dip; |
226 | |
227 | bp = bread(ip->dev, IBLOCK(ip->inum, sb)); |
228 | dip = (struct dinode*)bp->data + ip->inum%IPB; |
229 | dip->type = ip->type; |
230 | dip->major = ip->major; |
231 | dip->minor = ip->minor; |
232 | dip->nlink = ip->nlink; |
233 | dip->size = ip->size; |
234 | memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); |
235 | log_write(bp); |
236 | brelse(bp); |
237 | } |
238 | |
239 | // Find the inode with number inum on device dev |
240 | // and return the in-memory copy. Does not lock |
241 | // the inode and does not read it from disk. |
242 | static struct inode* |
243 | iget(uint dev, uint inum) |
244 | { |
245 | struct inode *ip, *empty; |
246 | |
247 | acquire(&icache.lock); |
248 | |
249 | // Is the inode already cached? |
250 | empty = 0; |
251 | for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ |
252 | if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ |
253 | ip->ref++; |
254 | release(&icache.lock); |
255 | return ip; |
256 | } |
257 | if(empty == 0 && ip->ref == 0) // Remember empty slot. |
258 | empty = ip; |
259 | } |
260 | |
261 | // Recycle an inode cache entry. |
262 | if(empty == 0) |
263 | panic("iget: no inodes" ); |
264 | |
265 | ip = empty; |
266 | ip->dev = dev; |
267 | ip->inum = inum; |
268 | ip->ref = 1; |
269 | ip->valid = 0; |
270 | release(&icache.lock); |
271 | |
272 | return ip; |
273 | } |
274 | |
275 | // Increment reference count for ip. |
276 | // Returns ip to enable ip = idup(ip1) idiom. |
277 | struct inode* |
278 | idup(struct inode *ip) |
279 | { |
280 | acquire(&icache.lock); |
281 | ip->ref++; |
282 | release(&icache.lock); |
283 | return ip; |
284 | } |
285 | |
286 | // Lock the given inode. |
287 | // Reads the inode from disk if necessary. |
288 | void |
289 | ilock(struct inode *ip) |
290 | { |
291 | struct buf *bp; |
292 | struct dinode *dip; |
293 | |
294 | if(ip == 0 || ip->ref < 1) |
295 | panic("ilock" ); |
296 | |
297 | acquiresleep(&ip->lock); |
298 | |
299 | if(ip->valid == 0){ |
300 | bp = bread(ip->dev, IBLOCK(ip->inum, sb)); |
301 | dip = (struct dinode*)bp->data + ip->inum%IPB; |
302 | ip->type = dip->type; |
303 | ip->major = dip->major; |
304 | ip->minor = dip->minor; |
305 | ip->nlink = dip->nlink; |
306 | ip->size = dip->size; |
307 | memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); |
308 | brelse(bp); |
309 | ip->valid = 1; |
310 | if(ip->type == 0) |
311 | panic("ilock: no type" ); |
312 | } |
313 | } |
314 | |
315 | // Unlock the given inode. |
316 | void |
317 | iunlock(struct inode *ip) |
318 | { |
319 | if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1) |
320 | panic("iunlock" ); |
321 | |
322 | releasesleep(&ip->lock); |
323 | } |
324 | |
// Drop a reference to an in-memory inode.
// If that was the last reference, the inode cache entry can
// be recycled.
// If that was the last reference and the inode has no links
// to it, free the inode (and its content) on disk.
// All calls to iput() must be inside a transaction in
// case it has to free the inode.
void
iput(struct inode *ip)
{
  // Take the sleep-lock first: valid and nlink may only be examined
  // while holding ip->lock.
  acquiresleep(&ip->lock);
  if(ip->valid && ip->nlink == 0){
    // ref is guarded by icache.lock, so snapshot it under that spin-lock
    // and drop the spin-lock again before doing any disk work.
    acquire(&icache.lock);
    int r = ip->ref;
    release(&icache.lock);
    if(r == 1){
      // inode has no links and no other references: truncate and free.
      itrunc(ip);
      ip->type = 0;   // type 0 is what ialloc() treats as a free inode
      iupdate(ip);
      ip->valid = 0;  // a later ilock() of this entry reloads it from disk
    }
  }
  releasesleep(&ip->lock);

  // Finally give up the caller's reference.
  acquire(&icache.lock);
  ip->ref--;
  release(&icache.lock);
}
354 | |
// Common idiom: unlock, then put.
// Releases ip's sleep-lock with iunlock() and then drops one reference
// with iput(); like iput(), it must be called inside a transaction.
void
iunlockput(struct inode *ip)
{
  iunlock(ip);
  iput(ip);
}
362 | |
363 | //PAGEBREAK! |
364 | // Inode content |
365 | // |
366 | // The content (data) associated with each inode is stored |
367 | // in blocks on the disk. The first NDIRECT block numbers |
368 | // are listed in ip->addrs[]. The next NINDIRECT blocks are |
369 | // listed in block ip->addrs[NDIRECT]. |
370 | |
371 | // Return the disk block address of the nth block in inode ip. |
372 | // If there is no such block, bmap allocates one. |
373 | static uint |
374 | bmap(struct inode *ip, uint bn) |
375 | { |
376 | uint addr, *a; |
377 | struct buf *bp; |
378 | |
379 | if(bn < NDIRECT){ |
380 | if((addr = ip->addrs[bn]) == 0) |
381 | ip->addrs[bn] = addr = balloc(ip->dev); |
382 | return addr; |
383 | } |
384 | bn -= NDIRECT; |
385 | |
386 | if(bn < NINDIRECT){ |
387 | // Load indirect block, allocating if necessary. |
388 | if((addr = ip->addrs[NDIRECT]) == 0) |
389 | ip->addrs[NDIRECT] = addr = balloc(ip->dev); |
390 | bp = bread(ip->dev, addr); |
391 | a = (uint*)bp->data; |
392 | if((addr = a[bn]) == 0){ |
393 | a[bn] = addr = balloc(ip->dev); |
394 | log_write(bp); |
395 | } |
396 | brelse(bp); |
397 | return addr; |
398 | } |
399 | |
400 | panic("bmap: out of range" ); |
401 | } |
402 | |
403 | // Truncate inode (discard contents). |
404 | // Only called when the inode has no links |
405 | // to it (no directory entries referring to it) |
406 | // and has no in-memory reference to it (is |
407 | // not an open file or current directory). |
408 | static void |
409 | itrunc(struct inode *ip) |
410 | { |
411 | int i, j; |
412 | struct buf *bp; |
413 | uint *a; |
414 | |
415 | for(i = 0; i < NDIRECT; i++){ |
416 | if(ip->addrs[i]){ |
417 | bfree(ip->dev, ip->addrs[i]); |
418 | ip->addrs[i] = 0; |
419 | } |
420 | } |
421 | |
422 | if(ip->addrs[NDIRECT]){ |
423 | bp = bread(ip->dev, ip->addrs[NDIRECT]); |
424 | a = (uint*)bp->data; |
425 | for(j = 0; j < NINDIRECT; j++){ |
426 | if(a[j]) |
427 | bfree(ip->dev, a[j]); |
428 | } |
429 | brelse(bp); |
430 | bfree(ip->dev, ip->addrs[NDIRECT]); |
431 | ip->addrs[NDIRECT] = 0; |
432 | } |
433 | |
434 | ip->size = 0; |
435 | iupdate(ip); |
436 | } |
437 | |
438 | // Copy stat information from inode. |
439 | // Caller must hold ip->lock. |
440 | void |
441 | stati(struct inode *ip, struct stat *st) |
442 | { |
443 | st->dev = ip->dev; |
444 | st->ino = ip->inum; |
445 | st->type = ip->type; |
446 | st->nlink = ip->nlink; |
447 | st->size = ip->size; |
448 | } |
449 | |
450 | //PAGEBREAK! |
451 | // Read data from inode. |
452 | // Caller must hold ip->lock. |
453 | int |
454 | readi(struct inode *ip, char *dst, uint off, uint n) |
455 | { |
456 | uint tot, m; |
457 | struct buf *bp; |
458 | |
459 | if(ip->type == T_DEV){ |
460 | if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) |
461 | return -1; |
462 | return devsw[ip->major].read(ip, dst, n); |
463 | } |
464 | |
465 | if(off > ip->size || off + n < off) |
466 | return -1; |
467 | if(off + n > ip->size) |
468 | n = ip->size - off; |
469 | |
470 | for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ |
471 | bp = bread(ip->dev, bmap(ip, off/BSIZE)); |
472 | m = min(n - tot, BSIZE - off%BSIZE); |
473 | memmove(dst, bp->data + off%BSIZE, m); |
474 | brelse(bp); |
475 | } |
476 | return n; |
477 | } |
478 | |
479 | // PAGEBREAK! |
480 | // Write data to inode. |
481 | // Caller must hold ip->lock. |
482 | int |
483 | writei(struct inode *ip, char *src, uint off, uint n) |
484 | { |
485 | uint tot, m; |
486 | struct buf *bp; |
487 | |
488 | if(ip->type == T_DEV){ |
489 | if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) |
490 | return -1; |
491 | return devsw[ip->major].write(ip, src, n); |
492 | } |
493 | |
494 | if(off > ip->size || off + n < off) |
495 | return -1; |
496 | if(off + n > MAXFILE*BSIZE) |
497 | return -1; |
498 | |
499 | for(tot=0; tot<n; tot+=m, off+=m, src+=m){ |
500 | bp = bread(ip->dev, bmap(ip, off/BSIZE)); |
501 | m = min(n - tot, BSIZE - off%BSIZE); |
502 | memmove(bp->data + off%BSIZE, src, m); |
503 | log_write(bp); |
504 | brelse(bp); |
505 | } |
506 | |
507 | if(n > 0 && off > ip->size){ |
508 | ip->size = off; |
509 | iupdate(ip); |
510 | } |
511 | return n; |
512 | } |
513 | |
514 | //PAGEBREAK! |
515 | // Directories |
516 | |
// Compare two directory-entry names.
// Compares at most DIRSIZ characters and returns strncmp()'s result,
// so 0 means the names are equal.
int
namecmp(const char *s, const char *t)
{
  return strncmp(s, t, DIRSIZ);
}
522 | |
523 | // Look for a directory entry in a directory. |
524 | // If found, set *poff to byte offset of entry. |
525 | struct inode* |
526 | dirlookup(struct inode *dp, char *name, uint *poff) |
527 | { |
528 | uint off, inum; |
529 | struct dirent de; |
530 | |
531 | if(dp->type != T_DIR) |
532 | panic("dirlookup not DIR" ); |
533 | |
534 | for(off = 0; off < dp->size; off += sizeof(de)){ |
535 | if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) |
536 | panic("dirlookup read" ); |
537 | if(de.inum == 0) |
538 | continue; |
539 | if(namecmp(name, de.name) == 0){ |
540 | // entry matches path element |
541 | if(poff) |
542 | *poff = off; |
543 | inum = de.inum; |
544 | return iget(dp->dev, inum); |
545 | } |
546 | } |
547 | |
548 | return 0; |
549 | } |
550 | |
551 | // Write a new directory entry (name, inum) into the directory dp. |
552 | int |
553 | dirlink(struct inode *dp, char *name, uint inum) |
554 | { |
555 | int off; |
556 | struct dirent de; |
557 | struct inode *ip; |
558 | |
559 | // Check that name is not present. |
560 | if((ip = dirlookup(dp, name, 0)) != 0){ |
561 | iput(ip); |
562 | return -1; |
563 | } |
564 | |
565 | // Look for an empty dirent. |
566 | for(off = 0; off < dp->size; off += sizeof(de)){ |
567 | if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) |
568 | panic("dirlink read" ); |
569 | if(de.inum == 0) |
570 | break; |
571 | } |
572 | |
573 | strncpy(de.name, name, DIRSIZ); |
574 | de.inum = inum; |
575 | if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de)) |
576 | panic("dirlink" ); |
577 | |
578 | return 0; |
579 | } |
580 | |
581 | //PAGEBREAK! |
582 | // Paths |
583 | |
584 | // Copy the next path element from path into name. |
585 | // Return a pointer to the element following the copied one. |
586 | // The returned path has no leading slashes, |
587 | // so the caller can check *path=='\0' to see if the name is the last one. |
588 | // If no name to remove, return 0. |
589 | // |
590 | // Examples: |
591 | // skipelem("a/bb/c", name) = "bb/c", setting name = "a" |
592 | // skipelem("///a//bb", name) = "bb", setting name = "a" |
593 | // skipelem("a", name) = "", setting name = "a" |
594 | // skipelem("", name) = skipelem("////", name) = 0 |
595 | // |
596 | static char* |
597 | skipelem(char *path, char *name) |
598 | { |
599 | char *s; |
600 | int len; |
601 | |
602 | while(*path == '/') |
603 | path++; |
604 | if(*path == 0) |
605 | return 0; |
606 | s = path; |
607 | while(*path != '/' && *path != 0) |
608 | path++; |
609 | len = path - s; |
610 | if(len >= DIRSIZ) |
611 | memmove(name, s, DIRSIZ); |
612 | else { |
613 | memmove(name, s, len); |
614 | name[len] = 0; |
615 | } |
616 | while(*path == '/') |
617 | path++; |
618 | return path; |
619 | } |
620 | |
621 | // Look up and return the inode for a path name. |
622 | // If parent != 0, return the inode for the parent and copy the final |
623 | // path element into name, which must have room for DIRSIZ bytes. |
624 | // Must be called inside a transaction since it calls iput(). |
625 | static struct inode* |
626 | namex(char *path, int nameiparent, char *name) |
627 | { |
628 | struct inode *ip, *next; |
629 | |
630 | if(*path == '/') |
631 | ip = iget(ROOTDEV, ROOTINO); |
632 | else |
633 | ip = idup(myproc()->cwd); |
634 | |
635 | while((path = skipelem(path, name)) != 0){ |
636 | ilock(ip); |
637 | if(ip->type != T_DIR){ |
638 | iunlockput(ip); |
639 | return 0; |
640 | } |
641 | if(nameiparent && *path == '\0'){ |
642 | // Stop one level early. |
643 | iunlock(ip); |
644 | return ip; |
645 | } |
646 | if((next = dirlookup(ip, name, 0)) == 0){ |
647 | iunlockput(ip); |
648 | return 0; |
649 | } |
650 | iunlockput(ip); |
651 | ip = next; |
652 | } |
653 | if(nameiparent){ |
654 | iput(ip); |
655 | return 0; |
656 | } |
657 | return ip; |
658 | } |
659 | |
// Return the inode named by path, or 0 if the lookup fails.
// Thin wrapper around namex() with nameiparent == 0; the name buffer is
// scratch space for namex() and is discarded.
struct inode*
namei(char *path)
{
  char name[DIRSIZ];
  return namex(path, 0, name);
}
666 | |
// Return the inode of the parent directory of path and copy the final
// path element into name, or return 0 if the lookup fails.
// Thin wrapper around namex() with nameiparent == 1; name is passed
// straight through, so it must have room for DIRSIZ bytes.
struct inode*
nameiparent(char *path, char *name)
{
  return namex(path, 1, name);
}
672 | |