This documentation is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
For more details see the file COPYING in the source distribution of Linux.
Table of Contents
n
to the eventfd counter.
Table of Contents
enum positive_aop_returns — aop return codes with specific semantics
enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE, AOP_TRUNCATED_PAGE };
Informs the caller that page writeback has
completed, that the page is still locked, and
should be considered active. The VM uses this hint
to return the page to the active list -- it won't
be a candidate for writeback again in the near
future. Other callers must be careful to unlock
the page if they get this return. Returned by
writepage
;
The AOP method that was handed a locked page has
unlocked it and the page might have been truncated.
The caller should back up to acquiring a new page and
trying again. The aop will be taking reasonable
precautions not to livelock. If the caller held a page
reference, it should drop it before retrying. Returned
by readpage
.
inc_nlink — directly increment an inode's link count
void inc_nlink ( | inode) ; |
struct inode * inode
;drop_nlink — directly drop an inode's link count
void drop_nlink ( | inode) ; |
struct inode * inode
;__d_drop — drop a dentry
void __d_drop ( | dentry) ; |
struct dentry * dentry
;
d_drop
unhashes the entry from the parent dentry hashes, so that it won't
be found through a VFS lookup any more. Note that this is different from
deleting the dentry - d_delete will try to mark the dentry negative if
possible, giving a successful _negative_ lookup, while d_drop will
just make the cache lookup fail.
d_drop
is used mainly for stuff that wants to invalidate a dentry for some
reason (NFS timeouts or autofs deletes).
__d_drop requires dentry->d_lock.
shrink_dcache_sb — shrink dcache for a superblock
void shrink_dcache_sb ( | sb) ; |
struct super_block * sb
;have_submounts — check for mounts over a dentry
int have_submounts ( | parent) ; |
struct dentry * parent
;shrink_dcache_parent — prune dcache
void shrink_dcache_parent ( | parent) ; |
struct dentry * parent
;d_alloc — allocate a dcache entry
struct dentry * d_alloc ( | parent, | |
name) ; |
struct dentry * parent
;const struct qstr * name
;d_instantiate — fill in inode information for a dentry
void d_instantiate ( | entry, | |
inode) ; |
struct dentry * entry
;struct inode * inode
;d_alloc_root — allocate root dentry
struct dentry * d_alloc_root ( | root_inode) ; |
struct inode * root_inode
;d_obtain_alias — find or allocate a dentry for a given inode
struct dentry * d_obtain_alias ( | inode) ; |
struct inode * inode
;Obtain a dentry for an inode resulting from NFS filehandle conversion or similar open by handle operations. The returned dentry may be anonymous, or may have a full name (if the inode was already in the cache).
When called on a directory inode, we must ensure that the inode only ever has one dentry. If a dentry is found, that is returned instead of allocating a new one.
On successful return, the reference to the inode has been transferred
to the dentry. In case of an error the reference on the inode is released.
To make it easier to use in export operations a NULL
or IS_ERR inode may
be passed in and the error will be propagated to the return value,
with a NULL
inode
replaced by ERR_PTR(-ESTALE).
d_splice_alias — splice a disconnected dentry into the tree if one exists
struct dentry * d_splice_alias ( | inode, | |
dentry) ; |
struct inode * inode
;struct dentry * dentry
;inode
the inode which may have a disconnected dentry
dentry
a negative dentry which we want to point to the inode.
If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), then d_move that in place of the given dentry and return it, else simply d_add the inode to the dentry and return NULL.
This is needed in the lookup routine of any filesystem that is exportable (via knfsd) so that we can build dcache paths to directories effectively.
If a dentry was found and moved, then it is returned. Otherwise NULL is returned. This matches the expected return value of ->lookup.
d_add_ci — lookup or allocate new dentry with case-exact name
struct dentry * d_add_ci ( | dentry, | |
inode, | ||
name) ; |
struct dentry * dentry
;struct inode * inode
;struct qstr * name
;dentry
the negative dentry that was passed to the parent's lookup func
inode
the inode case-insensitive lookup has found
name
the case-exact name to be associated with the returned dentry
This is to avoid filling the dcache with case-insensitive names to the same inode, only the actual correct case is stored in the dcache for case-insensitive filesystems.
For a case-insensitive lookup match, if the case-exact dentry already exists in the dcache, use it and return it.
If no entry exists with the exact case name, allocate new dentry with the exact case, and return the spliced entry.
d_lookup — search for a dentry
struct dentry * d_lookup ( | parent, | |
name) ; |
struct dentry * parent
;struct qstr * name
;dentry, or NULL
d_lookup searches the children of the parent dentry for the name in
question. If the dentry is found its reference count is incremented and the
dentry is returned. The caller must use dput to free the entry when it has
finished using it. NULL
is returned if the dentry does not exist.
d_validate — verify dentry provided from insecure source (deprecated)
int d_validate ( | dentry, | |
dparent) ; |
struct dentry * dentry
;struct dentry * dparent
;dentry_update_name_case — update case insensitive dentry with a new name
void dentry_update_name_case ( | dentry, | |
name) ; |
struct dentry * dentry
;struct qstr * name
;Update a case insensitive dentry with new case of name.
dentry must have been returned by d_lookup with name name
. Old and new
name lengths must match (ie. no d_compare which allows mismatched name
lengths).
Parent inode i_mutex must be held over d_lookup and into this call (to keep renames and concurrent inserts, and readdir(2) away).
d_materialise_unique — introduce an inode into the tree
struct dentry * d_materialise_unique ( | dentry, | |
inode) ; |
struct dentry * dentry
;struct inode * inode
;d_path — return the path of a dentry
char * d_path ( | path, | |
buf, | ||
buflen) ; |
const struct path * path
;char * buf
;int buflen
;Convert a dentry into an ASCII path name. If the entry has been deleted the string “ (deleted)” is appended. Note that this is ambiguous.
Returns a pointer into the buffer or an error code if the path was too long. Note: Callers should use the returned pointer, not the passed in buffer, to use the name! The implementation often starts at an offset into the buffer, and may leave 0 bytes at the start.
“buflen” should be positive.
find_inode_number — check for dentry with name
ino_t find_inode_number ( | dir, | |
name) ; |
struct dentry * dir
;struct qstr * name
;d_add — add dentry to hash queues
void d_add ( | entry, | |
inode) ; |
struct dentry * entry
;struct inode * inode
;d_add_unique — add dentry to hash queues without aliasing
struct dentry * d_add_unique ( | entry, | |
inode) ; |
struct dentry * entry
;struct inode * inode
;__d_rcu_to_refcount — take a refcount on dentry if sequence check is ok
int __d_rcu_to_refcount ( | dentry, | |
seq) ; |
struct dentry * dentry
;unsigned seq
;inode_init_always — perform inode structure initialisation
int inode_init_always ( | sb, | |
inode) ; |
struct super_block * sb
;struct inode * inode
;inode_sb_list_add — add inode to the superblock list of inodes
void inode_sb_list_add ( | inode) ; |
struct inode * inode
;__insert_inode_hash — hash an inode
void __insert_inode_hash ( | inode, | |
hashval) ; |
struct inode * inode
;unsigned long hashval
;remove_inode_hash — remove an inode from the hash
void remove_inode_hash ( | inode) ; |
struct inode * inode
;new_inode — obtain an inode
struct inode * new_inode ( | sb) ; |
struct super_block * sb
;
Allocates a new inode for given superblock. The default gfp_mask
for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
If HIGHMEM pages are unsuitable or it is known that pages allocated
for the page cache are not reclaimable or migratable,
mapping_set_gfp_mask
must be called with suitable flags on the
newly created inode's mapping
unlock_new_inode — clear the I_NEW state and wake up any waiters
void unlock_new_inode ( | inode) ; |
struct inode * inode
;iget5_locked — obtain an inode from a mounted file system
struct inode * iget5_locked ( | sb, | |
hashval, | ||
test, | ||
set, | ||
data) ; |
struct super_block * sb
;unsigned long hashval
;int (*test)
(
struct inode *, void *)
;int (*set)
(
struct inode *, void *)
;void * data
;sb
super block of file system
hashval
hash value (usually inode number) to get
test
callback used for comparisons between inodes
set
callback used to initialize a new struct inode
data
opaque data pointer to pass to test
and set
Search for the inode specified by hashval
and data
in the inode cache,
and if present return it with an increased reference count. This is
a generalized version of iget_locked
for file systems where the inode
number is not sufficient for unique identification of an inode.
If the inode is not in cache, allocate a new inode and return it locked,
hashed, and with the I_NEW flag set. The file system gets to fill it in
before unlocking it via unlock_new_inode
.
Note both test
and set
are called with the inode_hash_lock held, so can't
sleep.
iget_locked — obtain an inode from a mounted file system
struct inode * iget_locked ( | sb, | |
ino) ; |
struct super_block * sb
;unsigned long ino
;
Search for the inode specified by ino
in the inode cache and if present
return it with an increased reference count. This is for file systems
where the inode number is sufficient for unique identification of an inode.
If the inode is not in cache, allocate a new inode and return it locked,
hashed, and with the I_NEW flag set. The file system gets to fill it in
before unlocking it via unlock_new_inode
.
iunique — get a unique inode number
ino_t iunique ( | sb, | |
max_reserved) ; |
struct super_block * sb
;ino_t max_reserved
;ilookup5_nowait — search for an inode in the inode cache
struct inode * ilookup5_nowait ( | sb, | |
hashval, | ||
test, | ||
data) ; |
struct super_block * sb
;unsigned long hashval
;int (*test)
(
struct inode *, void *)
;void * data
;sb
super block of file system to search
hashval
hash value (usually inode number) to search for
test
callback used for comparisons between inodes
data
opaque data pointer to pass to test
Search for the inode specified by hashval
and data
in the inode cache.
If the inode is in the cache, the inode is returned with an incremented
reference count.
ilookup5 — search for an inode in the inode cache
struct inode * ilookup5 ( | sb, | |
hashval, | ||
test, | ||
data) ; |
struct super_block * sb
;unsigned long hashval
;int (*test)
(
struct inode *, void *)
;void * data
;sb
super block of file system to search
hashval
hash value (usually inode number) to search for
test
callback used for comparisons between inodes
data
opaque data pointer to pass to test
Search for the inode specified by hashval
and data
in the inode cache,
and if the inode is in the cache, return the inode with an incremented
reference count. Waits on I_NEW before returning the inode.
This is a generalized version of ilookup
for file systems where the
inode number is not sufficient for unique identification of an inode.
ilookup — search for an inode in the inode cache
struct inode * ilookup ( | sb, | |
ino) ; |
struct super_block * sb
;unsigned long ino
;bmap — find a block number in a file
sector_t bmap ( | inode, | |
block) ; |
struct inode * inode
;sector_t block
;touch_atime — update the access time
void touch_atime ( | mnt, | |
dentry) ; |
struct vfsmount * mnt
;struct dentry * dentry
;file_update_time — update mtime and ctime
void file_update_time ( | file) ; |
struct file * file
;Update the mtime and ctime members of an inode and mark the inode for writeback. Note that this function is meant exclusively for usage in the file write path of filesystems, and filesystems may choose to explicitly ignore update via this function with the S_NOCMTIME inode flag, e.g. for network filesystem where these timestamps are handled by the server.
inode_init_owner — Init uid,gid,mode for new inode according to posix standards
void inode_init_owner ( | inode, | |
dir, | ||
mode) ; |
struct inode * inode
;const struct inode * dir
;mode_t mode
;inode_owner_or_capable — check current task permissions to inode
bool inode_owner_or_capable ( | inode) ; |
const struct inode * inode
;make_bad_inode — mark an inode bad due to an I/O error
void make_bad_inode ( | inode) ; |
struct inode * inode
;deactivate_locked_super — drop an active reference to superblock
void deactivate_locked_super ( | s) ; |
struct super_block * s
;deactivate_super — drop an active reference to superblock
void deactivate_super ( | s) ; |
struct super_block * s
;generic_shutdown_super —
common helper for ->kill_sb
void generic_shutdown_super ( | sb) ; |
struct super_block * sb
;
generic_shutdown_super
does all fs-independent work on superblock
shutdown. Typical ->kill_sb
should pick all fs-specific objects
that need destruction out of superblock, call generic_shutdown_super
and release aforementioned objects. Note: dentries and inodes _are_
taken care of and do not need specific handling.
Upon calling this function, the filesystem may no longer alter or rearrange the set of dentries belonging to this super_block, nor may it change the attachments of dentries to inodes.
sget — find or create a superblock
struct super_block * sget ( | type, | |
test, | ||
set, | ||
data) ; |
struct file_system_type * type
;int (*test)
(
struct super_block *,void *)
;int (*set)
(
struct super_block *,void *)
;void * data
;get_super — get the superblock of a device
struct super_block * get_super ( | bdev) ; |
struct block_device * bdev
;posix_lock_file — Apply a POSIX-style lock to a file
int posix_lock_file ( | filp, | |
fl, | ||
conflock) ; |
struct file * filp
;struct file_lock * fl
;struct file_lock * conflock
;filp
The file to apply the lock to
fl
The lock to be applied
conflock
Place to return a copy of the conflicting lock, if found.
Add a POSIX style lock to a file. We merge adjacent & overlapping locks whenever possible. POSIX locks are sorted by owner task, then by starting address.
Note that if called with an FL_EXISTS argument, the caller may determine whether or not a lock was successfully freed by testing the return value for -ENOENT.
posix_lock_file_wait — Apply a POSIX-style lock to a file
int posix_lock_file_wait ( | filp, | |
fl) ; |
struct file * filp
;struct file_lock * fl
;locks_mandatory_area — Check for a conflicting lock
int locks_mandatory_area ( | read_write, | |
inode, | ||
filp, | ||
offset, | ||
count) ; |
int read_write
;struct inode * inode
;struct file * filp
;loff_t offset
;size_t count
;__break_lease — revoke all outstanding leases on file
int __break_lease ( | inode, | |
mode) ; |
struct inode * inode
;unsigned int mode
;lease_get_mtime — get the last modified time of an inode
void lease_get_mtime ( | inode, | |
time) ; |
struct inode * inode
;struct timespec * time
;generic_setlease — sets a lease on an open file
int generic_setlease ( | filp, | |
arg, | ||
flp) ; |
struct file * filp
;long arg
;struct file_lock ** flp
;vfs_setlease — sets a lease on an open file
int vfs_setlease ( | filp, | |
arg, | ||
lease) ; |
struct file * filp
;long arg
;struct file_lock ** lease
;Call this to establish a lease on the file. The (*lease)->fl_lmops->fl_break operation must be set; if not, break_lease will oops!
This will call the filesystem's setlease file method, if
defined. Note that there is no getlease method; instead, the
filesystem setlease method should call back to setlease
to
add a lease to the inode's lease list, where fcntl_getlease
can
find it. Since fcntl_getlease
only reports whether the current
task holds a lease, a cluster filesystem need only do this for
leases held by processes on this node.
There is also no break_lease method; filesystems that handle their own leases should break leases themselves from the filesystem's open, create, and (on truncate) setattr methods.
flock_lock_file_wait — Apply a FLOCK-style lock to a file
int flock_lock_file_wait ( | filp, | |
fl) ; |
struct file * filp
;struct file_lock * fl
;vfs_test_lock — test file byte range lock
int vfs_test_lock ( | filp, | |
fl) ; |
struct file * filp
;struct file_lock * fl
;vfs_lock_file — file byte range lock
int vfs_lock_file ( | filp, | |
cmd, | ||
fl, | ||
conf) ; |
struct file * filp
;unsigned int cmd
;struct file_lock * fl
;struct file_lock * conf
;filp
The file to apply the lock to
cmd
type of locking operation (F_SETLK, F_GETLK, etc.)
fl
The lock to be applied
conf
Place to return a copy of the conflicting lock, if found.
A caller that doesn't care about the conflicting lock may pass NULL as the final argument.
If the filesystem defines a private ->lock
method, then conf
will
be left unchanged; so a caller that cares should initialize it to
some acceptable default.
To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
locks, the ->lock
interface may return asynchronously, before the lock has
been granted or denied by the underlying filesystem, if (and only if)
fl_grant is set. Callers expecting ->lock
to return asynchronously
will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
the request is for a blocking lock. When ->lock
does return asynchronously,
it must return FILE_LOCK_DEFERRED, and call ->fl_grant
when the lock
request completes.
If the request is for non-blocking lock the file system should return
FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
with the result. If the request timed out the callback routine will return a
nonzero return code and the file system should release the lock. The file
system is also responsible to keep a corresponding posix lock when it
grants a lock so the VFS can find out which locks are locally held and do
the correct lock cleanup when required.
The underlying filesystem must not drop the kernel lock or call
->fl_grant
before returning to the caller with a FILE_LOCK_DEFERRED
return code.
posix_unblock_lock — stop waiting for a file lock
int posix_unblock_lock ( | filp, | |
waiter) ; |
struct file * filp
;struct file_lock * waiter
;vfs_cancel_lock — file byte range unblock lock
int vfs_cancel_lock ( | filp, | |
fl) ; |
struct file * filp
;struct file_lock * fl
;lock_may_read — checks that the region is free of locks
int lock_may_read ( | inode, | |
start, | ||
len) ; |
struct inode * inode
;loff_t start
;unsigned long len
;lock_may_write — checks that the region is free of locks
int lock_may_write ( | inode, | |
start, | ||
len) ; |
struct inode * inode
;loff_t start
;unsigned long len
;locks_mandatory_locked — Check for an active lock
int locks_mandatory_locked ( | inode) ; |
struct inode * inode
;fcntl_getlease — Enquire what lease is currently active
int fcntl_getlease ( | filp) ; |
struct file * filp
;The value returned by this function will be one of (if no lease break is pending):
F_RDLCK
to indicate a shared lease is held.
F_WRLCK
to indicate an exclusive lease is held.
F_UNLCK
to indicate no lease is held.
(if a lease break is pending):
F_RDLCK
to indicate an exclusive lease needs to be
changed to a shared lease (or removed).
F_UNLCK
to indicate the lease needs to be removed.
fcntl_setlease — sets a lease on an open file
int fcntl_setlease ( | fd, | |
filp, | ||
arg) ; |
unsigned int fd
;struct file * filp
;long arg
;sys_flock —
flock
system call.
long sys_flock ( | fd, | |
cmd) ; |
unsigned int fd
;unsigned int cmd
;
Apply a FL_FLOCK
style lock to an open file descriptor.
The cmd
can be one of
LOCK_SH
-- a shared lock.
LOCK_EX
-- an exclusive lock.
LOCK_UN
-- remove an existing lock.
LOCK_MAND
-- a `mandatory' flock. This exists to emulate Windows Share Modes.
LOCK_MAND
can be combined with LOCK_READ
or LOCK_WRITE
to allow other
processes read and write access respectively.
mpage_readpages — populate an address space with some pages & start reads against them
int mpage_readpages ( | mapping, | |
pages, | ||
nr_pages, | ||
get_block) ; |
struct address_space * mapping
;struct list_head * pages
;unsigned nr_pages
;get_block_t get_block
;mapping
the address_space
pages
The address of a list_head which contains the target pages. These
pages have their ->index populated and are otherwise uninitialised.
The page at pages
->prev has the lowest file offset, and reads should be
issued in pages
->prev to pages
->next order.
nr_pages
The number of pages at *pages
get_block
The filesystem's block mapper function.
This function walks the pages and the blocks within each page, building and emitting large BIOs.
If anything unusual happens, such as:
- encountering a page which has buffers
- encountering a page which has a non-hole after a hole
- encountering a page with non-contiguous blocks
then this code just gives up and calls the buffer_head-based read function. It does handle a page which has holes at the end - that is a common case: the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
There is a problem. The mpage read code assembles several pages, gets all their disk mappings, and then submits them all. That's fine, but obtaining the disk mappings may require I/O. Reads of indirect blocks, for example.
So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
because the indirect block has to be read to get the mappings of blocks 13,14,15,16. Obviously, this impacts performance.
So what we do is to allow the filesystem's get_block
function to set
BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
after this one will require I/O against a block which is probably close to
this one. So you should push what I/O you have currently accumulated.
This all causes the disk requests to be issued in the correct order.
mpage_writepages —
walk the list of dirty pages of the given address space & writepage
all of them
int mpage_writepages ( | mapping, | |
wbc, | ||
get_block) ; |
struct address_space * mapping
;struct writeback_control * wbc
;get_block_t get_block
;mapping
address space structure to write
wbc
subtract the number of written pages from *wbc
->nr_to_write
get_block
the filesystem's block mapper function. If this is NULL then use a_ops->writepage. Otherwise, go direct-to-BIO.
This is a library function, which implements the writepages
address_space_operation.
If a page is already under I/O, generic_writepages
skips it, even
if it's dirty. This is desirable behaviour for memory-cleaning writeback,
but it is INCORRECT for data-integrity system calls such as fsync
. fsync
and msync
need to guarantee that all the data which was dirty at the time
the call was made get new I/O started against them. If wbc->sync_mode is
WB_SYNC_ALL then we were called for data integrity and we must wait for
existing IO to complete.
generic_permission — check for access rights on a Posix-like filesystem
int generic_permission ( | inode, | |
mask, | ||
flags, | ||
check_acl) ; |
struct inode * inode
;int mask
;unsigned int flags
;int (*check_acl)
(
struct inode *inode, int mask, unsigned int flags)
;inode
inode to check access rights for
mask
right to check for (MAY_READ
, MAY_WRITE
, MAY_EXEC
)
flags
IPERM_FLAG_ flags.
check_acl
optional callback to check for Posix ACLs
Used to check for read/write/execute permissions on a file. We use “fsuid” for this, letting us set arbitrary permissions for filesystem access without changing the “normal” uids which are used for other things.
generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk request cannot be satisfied (eg. requires blocking or too much complexity). It would then be called again in ref-walk mode.
inode_permission — check for access rights to a given inode
int inode_permission ( | inode, | |
mask) ; |
struct inode * inode
;int mask
;file_permission — check for additional access rights to a given file
int file_permission ( | file, | |
mask) ; |
struct file * file
;int mask
;vfs_path_lookup — lookup a file path relative to a dentry-vfsmount pair
int vfs_path_lookup ( | dentry, | |
mnt, | ||
name, | ||
flags, | ||
nd) ; |
struct dentry * dentry
;struct vfsmount * mnt
;const char * name
;unsigned int flags
;struct nameidata * nd
;lookup_one_len — filesystem helper to lookup single pathname component
struct dentry * lookup_one_len ( | name, | |
base, | ||
len) ; |
const char * name
;struct dentry * base
;int len
;lookup_create — lookup a dentry, creating it if it doesn't exist
struct dentry * lookup_create ( | nd, | |
is_dir) ; |
struct nameidata * nd
;int is_dir
;sync_mapping_buffers — write out & wait upon a mapping's “associated” buffers
int sync_mapping_buffers ( | mapping) ; |
struct address_space * mapping
;mark_buffer_dirty — mark a buffer_head as needing writeout
void mark_buffer_dirty ( | bh) ; |
struct buffer_head * bh
;
mark_buffer_dirty
will set the dirty bit against the buffer, then set its
backing page dirty, then tag the page as dirty in its address_space's radix
tree and then attach the address_space's inode to its superblock's dirty
inode list.
mark_buffer_dirty
is atomic. It takes bh->b_page->mapping->private_lock,
mapping->tree_lock and mapping->host->i_lock.
__bread — reads a specified block and returns the bh
struct buffer_head * __bread ( | bdev, | |
block, | ||
size) ; |
struct block_device * bdev
;sector_t block
;unsigned size
;block_invalidatepage — invalidate part or all of a buffer-backed page
void block_invalidatepage ( | page, | |
offset) ; |
struct page * page
;unsigned long offset
;
block_invalidatepage
is called when all or part of the page has become
invalidated by a truncate operation.
block_invalidatepage
does not have to release all buffers, but it must
ensure that no dirty buffer is left outside offset
and that no I/O
is underway against any of the blocks which are outside the truncation
point. Because the caller is about to free (and possibly reuse) those
blocks on-disk.
ll_rw_block — low-level access to block devices (DEPRECATED)
void ll_rw_block ( | rw, | |
nr, | ||
bhs[]) ; |
int rw
;int nr
;struct buffer_head * bhs[]
;rw
whether to READ
or WRITE
or maybe READA
(readahead)
nr
number of struct buffer_heads in the array
bhs[]
array of pointers to struct buffer_head
ll_rw_block
takes an array of pointers to struct buffer_heads, and
requests an I/O operation on them, either a READ
or a WRITE
. The third
READA
option is described in the documentation for generic_make_request
which ll_rw_block
calls.
This function drops any buffer that it cannot get a lock on (with the BH_Lock state bit), any buffer that appears to be clean when doing a write request, and any buffer that appears to be up-to-date when doing read request. Further it marks as clean buffers that are processed for writing (the buffer cache won't assume that they are actually clean until the buffer gets unlocked).
ll_rw_block sets b_end_io to a simple completion handler that marks the buffer up-to-date (if appropriate), unlocks the buffer and wakes any waiters.
All of the buffers must be for the same device, and must also be a multiple of the current approved size for the device.
bh_uptodate_or_lock — Test whether the buffer is uptodate
int bh_uptodate_or_lock ( | bh) ; |
struct buffer_head * bh
;bh_submit_read — Submit a locked buffer for reading
int bh_submit_read ( | bh) ; |
struct buffer_head * bh
;bio_alloc_bioset — allocate a bio for I/O
struct bio * bio_alloc_bioset ( | gfp_mask, | |
nr_iovecs, | ||
bs) ; |
gfp_t gfp_mask
;int nr_iovecs
;struct bio_set * bs
;gfp_mask
the GFP_ mask given to the slab allocator
nr_iovecs
number of iovecs to pre-allocate
bs
the bio_set to allocate from.
bio_alloc_bioset will try its own mempool to satisfy the allocation.
If __GFP_WAIT
is set then we will block on the internal pool waiting
for a struct bio to become free.
Note that the caller must set ->bi_destructor on successful return of a bio, to do the appropriate freeing of the bio once the reference count drops to zero.
bio_alloc — allocate a new bio, memory pool backed
struct bio * bio_alloc ( | gfp_mask, | |
nr_iovecs) ; |
gfp_t gfp_mask
;int nr_iovecs
;
bio_alloc will allocate a bio and associated bio_vec array that can hold
at least nr_iovecs
entries. Allocations will be done from the
fs_bio_set. Also see bio_alloc_bioset
and bio_kmalloc
.
If __GFP_WAIT
is set, then bio_alloc will always be able to allocate
a bio. This is due to the mempool guarantees. To make this work, callers
must never allocate more than 1 bio at a time from this pool. Callers
that need to allocate more than 1 bio must always submit the previously
allocated bio for IO before attempting to allocate a new one. Failure to
do so can cause livelocks under memory pressure.
bio_kmalloc —
allocate a bio for I/O using kmalloc
struct bio * bio_kmalloc ( | gfp_mask, | |
nr_iovecs) ; |
gfp_t gfp_mask
;int nr_iovecs
;__bio_clone — clone a bio
void __bio_clone ( | bio, | |
bio_src) ; |
struct bio * bio
;struct bio * bio_src
;bio_clone — clone a bio
struct bio * bio_clone ( | bio, | |
gfp_mask) ; |
struct bio * bio
;gfp_t gfp_mask
;bio_get_nr_vecs — return approx number of vecs
int bio_get_nr_vecs ( | bdev) ; |
struct block_device * bdev
;bio_add_pc_page — attempt to add page to bio
int bio_add_pc_page ( | q, | |
bio, | ||
page, | ||
len, | ||
offset) ; |
struct request_queue * q
;struct bio * bio
;struct page * page
;unsigned int len
;unsigned int offset
;q
the target queue
bio
destination bio
page
page to add
len
vec entry length
offset
vec entry offset
Attempt to add a page to the bio_vec maplist. This can fail for a number of reasons, such as the bio being full or target block device limitations. The target block device must allow bio's up to PAGE_SIZE, so it is always possible to add a single page to an empty bio.
This should only be used by REQ_PC bios.
bio_add_page — attempt to add page to bio
int bio_add_page ( | bio, | |
page, | ||
len, | ||
offset) ; |
struct bio * bio
;struct page * page
;unsigned int len
;unsigned int offset
;bio_uncopy_user — finish previously mapped bio
int bio_uncopy_user ( | bio) ; |
struct bio * bio
;bio_copy_user — copy user data to bio
struct bio * bio_copy_user ( | q, | |
map_data, | ||
uaddr, | ||
len, | ||
write_to_vm, | ||
gfp_mask) ; |
struct request_queue * q
;struct rq_map_data * map_data
;unsigned long uaddr
;unsigned int len
;int write_to_vm
;gfp_t gfp_mask
;bio_map_user — map user address into bio
struct bio * bio_map_user ( | q, | |
bdev, | ||
uaddr, | ||
len, | ||
write_to_vm, | ||
gfp_mask) ; |
struct request_queue * q
;struct block_device * bdev
;unsigned long uaddr
;unsigned int len
;int write_to_vm
;gfp_t gfp_mask
;bio_map_kern — map kernel address into bio
struct bio * bio_map_kern ( | q, | |
data, | ||
len, | ||
gfp_mask) ; |
struct request_queue * q
;void * data
;unsigned int len
;gfp_t gfp_mask
;bio_copy_kern — copy kernel address into bio
struct bio * bio_copy_kern ( | q, | |
data, | ||
len, | ||
gfp_mask, | ||
reading) ; |
struct request_queue * q
;void * data
;unsigned int len
;gfp_t gfp_mask
;int reading
;bio_endio — end I/O on a bio
void bio_endio ( | bio, | |
error) ; |
struct bio * bio
;int error
;
bio_endio
will end I/O on the whole bio. bio_endio
is the
preferred way to end I/O on a bio, it takes care of clearing
BIO_UPTODATE on error. error
is 0 on success, and one of the
established -Exxxx (-EIO, for instance) error values in case
something went wrong. No one should call bi_end_io
directly on a
bio unless they own it and thus know that it has an end_io
function.
bio_sector_offset — Find hardware sector offset in bio
sector_t bio_sector_offset ( | bio, | |
index, | ||
offset) ; |
struct bio * bio
;unsigned short index
;unsigned int offset
;bioset_create — Create a bio_set
struct bio_set * bioset_create ( | pool_size, | |
front_pad) ; |
unsigned int pool_size
;unsigned int front_pad
;pool_size
Number of bio and bio_vecs to cache in the mempool
front_pad
Number of bytes to allocate in front of the returned bio
Set up a bio_set to be used with bio_alloc_bioset
. Allows the caller
to ask for a number of bytes to be allocated in front of the bio.
Front pad allocation is useful for embedding the bio inside
another structure, to avoid allocating extra data to go with the bio.
Note that the bio must be embedded at the END of that structure always,
or things will break badly.
seq_open — initialize sequential file
int seq_open ( | file, | |
op) ; |
struct file * file
;const struct seq_operations * op
;
seq_open
sets file
, associating it with a sequence described
by op
. op
->start
sets the iterator up and returns the first
element of sequence. op
->stop
shuts it down. op
->next
returns the next element of sequence. op
->show
prints element
into the buffer. In case of error ->start
and ->next
return
ERR_PTR(error). At the end of the sequence they return NULL
. ->show
returns 0 in case of success and negative number in case of error.
Returning SEQ_SKIP means “discard this element and move on”.
seq_read —
->read
method for sequential files.
ssize_t seq_read ( | file, | |
buf, | ||
size, | ||
ppos) ; |
struct file * file
;char __user * buf
;size_t size
;loff_t * ppos
;seq_lseek —
->llseek
method for sequential files.
loff_t seq_lseek ( | file, | |
offset, | ||
origin) ; |
struct file * file
;loff_t offset
;int origin
;seq_release — free the structures associated with sequential file.
int seq_release ( | inode, | |
file) ; |
struct inode * inode
;struct file * file
;seq_escape — print string into buffer, escaping some characters
int seq_escape ( | m, | |
s, | ||
esc) ; |
struct seq_file * m
;const char * s
;const char * esc
;mangle_path — mangle and copy path to buffer beginning
char * mangle_path ( | s, | |
p, | ||
esc) ; |
char * s
;char * p
;char * esc
;seq_path — seq_file interface to print a pathname
int seq_path ( | m, | |
path, | ||
esc) ; |
struct seq_file * m
;struct path * path
;char * esc
;seq_write — write arbitrary data to buffer
int seq_write ( | seq, | |
data, | ||
len) ; |
struct seq_file * seq
;const void * data
;size_t len
;seq_hlist_start — start an iteration of a hlist
struct hlist_node * seq_hlist_start ( | head, | |
pos) ; |
struct hlist_head * head
;loff_t pos
;seq_hlist_start_head — start an iteration of a hlist
struct hlist_node * seq_hlist_start_head ( | head, | |
pos) ; |
struct hlist_head * head
;loff_t pos
;seq_hlist_next — move to the next position of the hlist
struct hlist_node * seq_hlist_next ( | v, | |
head, | ||
ppos) ; |
void * v
;struct hlist_head * head
;loff_t * ppos
;seq_hlist_start_rcu — start an iteration of a hlist protected by RCU
struct hlist_node * seq_hlist_start_rcu ( | head, | |
pos) ; |
struct hlist_head * head
;loff_t pos
;seq_hlist_start_head_rcu — start an iteration of a hlist protected by RCU
struct hlist_node * seq_hlist_start_head_rcu ( | head, | |
pos) ; |
struct hlist_head * head
;loff_t pos
;seq_hlist_next_rcu — move to the next position of the hlist protected by RCU
struct hlist_node * seq_hlist_next_rcu ( | v, | |
head, | ||
ppos) ; |
void * v
;struct hlist_head * head
;loff_t * ppos
;register_filesystem — register a new filesystem
int register_filesystem ( | fs) ; |
struct file_system_type * fs
;Adds the file system passed to the list of file systems the kernel is aware of for mount and other syscalls. Returns 0 on success, or a negative errno code on an error.
The struct file_system_type that is passed is linked into the kernel structures and must not be freed until the file system has been unregistered.
unregister_filesystem — unregister a file system
int unregister_filesystem ( | fs) ; |
struct file_system_type * fs
;__mark_inode_dirty — internal function
void __mark_inode_dirty ( | inode, | |
flags) ; |
struct inode * inode
;int flags
;inode
inode to mark
flags
what kind of dirty (e.g. I_DIRTY_SYNC)
Mark an inode as dirty. Callers should use mark_inode_dirty or mark_inode_dirty_sync.
Put the inode on the super block's dirty list.
CAREFUL! We mark it dirty unconditionally, but move it onto the dirty list only if it is hashed or if it refers to a blockdev. If it was not hashed, it will never be added to the dirty list even if it is later hashed, as it will have been marked dirty already.
In short, make sure you hash any inodes _before_ you start marking them dirty.
Note that for blockdevs, inode->dirtied_when represents the dirtying time of the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of the kernel-internal blockdev inode represents the dirtying time of the blockdev's pages. This is why for I_DIRTY_PAGES we always use page->mapping->host, so the page-dirtying time is recorded in the internal blockdev inode.
writeback_inodes_sb_nr — writeback dirty inodes from given super_block
void writeback_inodes_sb_nr ( | sb, | |
nr) ; |
struct super_block * sb
;unsigned long nr
;writeback_inodes_sb — writeback dirty inodes from given super_block
void writeback_inodes_sb ( | sb) ; |
struct super_block * sb
;writeback_inodes_sb_if_idle — start writeback if none underway
int writeback_inodes_sb_if_idle ( | sb) ; |
struct super_block * sb
;writeback_inodes_sb_nr_if_idle — start writeback if none underway
int writeback_inodes_sb_nr_if_idle ( | sb, | |
nr) ; |
struct super_block * sb
;unsigned long nr
;write_inode_now — write an inode to disk
int write_inode_now ( | inode, | |
sync) ; |
struct inode * inode
;int sync
;sync_inode — write an inode and its pages to disk.
int sync_inode ( | inode, | |
wbc) ; |
struct inode * inode
;struct writeback_control * wbc
;sync_inode_metadata — write an inode to disk
int sync_inode_metadata ( | inode, | |
wait) ; |
struct inode * inode
;int wait
;freeze_bdev — lock a filesystem and force it into a consistent state
struct super_block * freeze_bdev ( | bdev) ; |
struct block_device * bdev
;
If a superblock is found on this device, we take the s_umount semaphore
on it to make sure nobody unmounts until the snapshot creation is done.
The reference counter (bd_fsfreeze_count) guarantees that only the last
thaw request actually unfreezes the frozen filesystem when multiple
freeze requests arrive simultaneously. It counts up in freeze_bdev
and
counts down in thaw_bdev
. When it becomes 0, thaw_bdev
actually unfreezes
the filesystem.
thaw_bdev — unlock filesystem
int thaw_bdev ( | bdev, | |
sb) ; |
struct block_device * bdev
;struct super_block * sb
;bd_link_disk_holder — create symlinks between holding disk and slave bdev
int bd_link_disk_holder ( | bdev, | |
disk) ; |
struct block_device * bdev
;struct gendisk * disk
;DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
This function creates the following sysfs symlinks.
- from “slaves” directory of the holder disk
to the claimed bdev
- from “holders” directory of the bdev
to the holder disk
For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
passed to bd_link_disk_holder
, then:
/sys/block/dm-0/slaves/sda --> /sys/block/sda /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
The caller must have claimed bdev
before calling this function and
ensure that both bdev
and disk
are valid during the creation and
lifetime of these symlinks.
bd_unlink_disk_holder —
destroy symlinks created by bd_link_disk_holder
void bd_unlink_disk_holder ( | bdev, | |
disk) ; |
struct block_device * bdev
;struct gendisk * disk
;check_disk_size_change — checks for disk size change and adjusts bdev size.
void check_disk_size_change ( | disk, | |
bdev) ; |
struct gendisk * disk
;struct block_device * bdev
;revalidate_disk — wrapper for lower-level driver's revalidate_disk call-back
int revalidate_disk ( | disk) ; |
struct gendisk * disk
;blkdev_get — open a block device
int blkdev_get ( | bdev, | |
mode, | ||
holder) ; |
struct block_device * bdev
;fmode_t mode
;void * holder
;blkdev_get_by_path — open a block device by name
struct block_device * blkdev_get_by_path ( | path, | |
mode, | ||
holder) ; |
const char * path
;fmode_t mode
;void * holder
;blkdev_get_by_dev — open a block device by device number
struct block_device * blkdev_get_by_dev ( | dev, | |
mode, | ||
holder) ; |
dev_t dev
;fmode_t mode
;void * holder
;dev
device number of block device to open
mode
FMODE_* mask
holder
exclusive holder identifier
Open the blockdevice described by device number dev
. mode
and
holder
are identical to blkdev_get
.
Use it ONLY if you really do not have anything better - i.e. when you are behind a truly sucky interface and all you are given is a device number. _Never_ to be used for internal purposes. If you ever need it - reconsider your API.
On success, the returned block_device has reference count of one.
Table of Contents
register_sysctl_paths — register a sysctl table hierarchy
struct ctl_table_header * register_sysctl_paths ( | path, | |
table) ; |
const struct ctl_path * path
;struct ctl_table * table
;register_sysctl_table — register a sysctl table hierarchy
struct ctl_table_header * register_sysctl_table ( | table) ; |
struct ctl_table * table
;unregister_sysctl_table — unregister a sysctl table hierarchy
void unregister_sysctl_table ( | header) ; |
struct ctl_table_header * header
;proc_dostring — read a string sysctl
int proc_dostring ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;table
the sysctl table
write
TRUE
if this is a write to the sysctl file
buffer
the user buffer
lenp
the size of the user buffer
ppos
file position
Reads/writes a string from/to the user buffer. If the kernel
buffer provided is not large enough to hold the string, the
string is truncated. The copied string is NULL-terminated
.
If the string is being read by the user process, it is copied
and a newline '\n' is added. It is truncated if the buffer is
not large enough.
Returns 0 on success.
proc_dointvec — read a vector of integers
int proc_dointvec ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;proc_dointvec_minmax — read a vector of integers with min/max values
int proc_dointvec_minmax ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;proc_doulongvec_minmax — read a vector of long integers with min/max values
int proc_doulongvec_minmax ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;proc_doulongvec_ms_jiffies_minmax — read a vector of millisecond values with min/max values
int proc_doulongvec_ms_jiffies_minmax ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;table
the sysctl table
write
TRUE
if this is a write to the sysctl file
buffer
the user buffer
lenp
the size of the user buffer
ppos
file position
Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long values from/to the user buffer, treated as an ASCII string. The values are treated as milliseconds, and converted to jiffies when they are stored.
This routine will ensure the values are within the range specified by table->extra1 (min) and table->extra2 (max).
Returns 0 on success.
proc_dointvec_jiffies — read a vector of integers as seconds
int proc_dointvec_jiffies ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;proc_dointvec_userhz_jiffies — read a vector of integers as 1/USER_HZ seconds
int proc_dointvec_userhz_jiffies ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;proc_dointvec_ms_jiffies — read a vector of integers as milliseconds
int proc_dointvec_ms_jiffies ( | table, | |
write, | ||
buffer, | ||
lenp, | ||
ppos) ; |
struct ctl_table * table
;int write
;void __user * buffer
;size_t * lenp
;loff_t * ppos
;proc_flush_task —
Remove dcache entries for task
from the /proc dcache.
void proc_flush_task ( | task) ; |
struct task_struct * task
;When flushing dentries from proc, one needs to flush them from global proc (proc_mnt) and from all the namespaces' procs this task was seen in. This call is supposed to do all of this job.
Looks in the dcache for
/proc/pid
/proc/tgid
/task/pid
if either directory is present, flushes it and all of its children
from the dcache.
It is safe and reasonable to cache /proc entries for a task until that task exits. After that they just clog up the dcache with useless entries, possibly causing useful dcache entries to be flushed instead. This routine is provided to flush those useless dcache entries at process exit time.
Table of Contents
n
to the eventfd counter.
eventfd_signal —
Adds n
to the eventfd counter.
int eventfd_signal ( | ctx, | |
n) ; |
struct eventfd_ctx * ctx
;int n
;ctx
[in] Pointer to the eventfd context.
n
[in] Value of the counter to be added to the eventfd internal counter. The value cannot be negative.
This function is supposed to be called by the kernel in paths that do not allow sleeping. In this function we allow the counter to reach the ULLONG_MAX value, and we signal this as an overflow condition by returning a POLLERR to poll(2).
Returns n
in case of success, a non-negative number lower than n
in case
of overflow, or the following error codes:
-EINVAL : The value of n
is negative.
eventfd_ctx_get — Acquires a reference to the internal eventfd context.
struct eventfd_ctx * eventfd_ctx_get ( | ctx) ; |
struct eventfd_ctx * ctx
;eventfd_ctx_put — Releases a reference to the internal eventfd context.
void eventfd_ctx_put ( | ctx) ; |
struct eventfd_ctx * ctx
;eventfd_ctx_remove_wait_queue — Reads the current counter and removes the wait queue.
int eventfd_ctx_remove_wait_queue ( | ctx, | |
wait, | ||
cnt) ; |
struct eventfd_ctx * ctx
;wait_queue_t * wait
;__u64 * cnt
;eventfd_ctx_read — Reads the eventfd counter or waits if it is zero.
ssize_t eventfd_ctx_read ( | ctx, | |
no_wait, | ||
cnt) ; |
struct eventfd_ctx * ctx
;int no_wait
;__u64 * cnt
;eventfd_fget — Acquire a reference of an eventfd file descriptor.
struct file * eventfd_fget ( | fd) ; |
int fd
;eventfd_ctx_fdget — Acquires a reference to the internal eventfd context.
struct eventfd_ctx * eventfd_ctx_fdget ( | fd) ; |
int fd
;Table of Contents
sysfs_create_file — create an attribute file for an object.
int sysfs_create_file ( | kobj, | |
attr) ; |
struct kobject * kobj
;const struct attribute * attr
;sysfs_add_file_to_group — add an attribute file to a pre-existing group.
int sysfs_add_file_to_group ( | kobj, | |
attr, | ||
group) ; |
struct kobject * kobj
;const struct attribute * attr
;const char * group
;sysfs_chmod_file — update the modified mode value on an object attribute.
int sysfs_chmod_file ( | kobj, | |
attr, | ||
mode) ; |
struct kobject * kobj
;const struct attribute * attr
;mode_t mode
;sysfs_remove_file — remove an object attribute.
void sysfs_remove_file ( | kobj, | |
attr) ; |
struct kobject * kobj
;const struct attribute * attr
;sysfs_remove_file_from_group — remove an attribute file from a group.
void sysfs_remove_file_from_group ( | kobj, | |
attr, | ||
group) ; |
struct kobject * kobj
;const struct attribute * attr
;const char * group
;sysfs_schedule_callback — helper to schedule a callback for a kobject
int sysfs_schedule_callback ( | kobj, | |
func, | ||
data, | ||
owner) ; |
struct kobject * kobj
;void (*func)
(
void *)
;void * data
;struct module * owner
;kobj
object we're acting for.
func
callback function to invoke later.
data
argument to pass to func
.
owner
module owning the callback code
sysfs attribute methods must not unregister themselves or their parent kobject (which would amount to the same thing). Attempts to do so will deadlock, since unregistration is mutually exclusive with driver callbacks.
Instead methods can call this routine, which will attempt to allocate
and schedule a workqueue request to call back func
with data
as its
argument in the workqueue's process context. kobj
will be pinned
until func
returns.
Returns 0 if the request was submitted, -ENOMEM if storage could not
be allocated, -ENODEV if a reference to owner
isn't available,
-EAGAIN if a callback has already been scheduled for kobj
.
sysfs_create_link — create symlink between two objects.
int sysfs_create_link ( | kobj, | |
target, | ||
name) ; |
struct kobject * kobj
;struct kobject * target
;const char * name
;sysfs_remove_link — remove symlink in object's directory.
void sysfs_remove_link ( | kobj, | |
name) ; |
struct kobject * kobj
;const char * name
;sysfs_rename_link — rename symlink in object's directory.
int sysfs_rename_link ( | kobj, | |
targ, | ||
old, | ||
new) ; |
struct kobject * kobj
;struct kobject * targ
;const char * old
;const char * new
;Table of Contents
debugfs_create_file — create a file in the debugfs filesystem
struct dentry * debugfs_create_file ( | name, | |
mode, | ||
parent, | ||
data, | ||
fops) ; |
const char * name
;mode_t mode
;struct dentry * parent
;void * data
;const struct file_operations * fops
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have.
parent
a pointer to the parent dentry for this file. This should be a directory dentry if set. If this parameter is NULL, then the file will be created in the root of the debugfs filesystem.
data
a pointer to something that the caller will want to get to later
on. The inode.i_private pointer will point to this value on
the open
call.
fops
a pointer to a struct file_operations that should be used for this file.
This is the basic “create a file” function for debugfs. It allows for a
wide range of flexibility in creating a file, or a directory (if you want
to create a directory, the debugfs_create_dir
function is
recommended to be used instead.)
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned.
debugfs_create_dir — create a directory in the debugfs filesystem
struct dentry * debugfs_create_dir ( | name, | |
parent) ; |
const char * name
;struct dentry * parent
;name
a pointer to a string containing the name of the directory to create.
parent
a pointer to the parent dentry for this file. This should be a directory dentry if set. If this parameter is NULL, then the directory will be created in the root of the debugfs filesystem.
This function creates a directory in debugfs with the given name.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned.
debugfs_create_symlink — create a symbolic link in the debugfs filesystem
struct dentry * debugfs_create_symlink ( | name, | |
parent, | ||
target) ; |
const char * name
;struct dentry * parent
;const char * target
;name
a pointer to a string containing the name of the symbolic link to create.
parent
a pointer to the parent dentry for this symbolic link. This should be a directory dentry if set. If this parameter is NULL, then the symbolic link will be created in the root of the debugfs filesystem.
target
a pointer to a string containing the path to the target of the symbolic link.
This function creates a symbolic link with the given name in debugfs that links to the given target path.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the symbolic
link is to be removed (no automatic cleanup happens if your module is
unloaded, you are responsible here.) If an error occurs, NULL
will be
returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned.
debugfs_remove — removes a file or directory from the debugfs filesystem
void debugfs_remove ( | dentry) ; |
struct dentry * dentry
;
This function removes a file or directory in debugfs that was previously
created with a call to another debugfs function (like
debugfs_create_file
or variants thereof.)
This function is required to be called in order for the file to be removed, no automatic cleanup of files will happen when a module is removed, you are responsible here.
debugfs_remove_recursive — recursively removes a directory
void debugfs_remove_recursive ( | dentry) ; |
struct dentry * dentry
;
This function recursively removes a directory tree in debugfs that
was previously created with a call to another debugfs function
(like debugfs_create_file
or variants thereof.)
This function is required to be called in order for the file to be removed, no automatic cleanup of files will happen when a module is removed, you are responsible here.
debugfs_rename — rename a file/directory in the debugfs filesystem
struct dentry * debugfs_rename ( | old_dir, | |
old_dentry, | ||
new_dir, | ||
new_name) ; |
struct dentry * old_dir
;struct dentry * old_dentry
;struct dentry * new_dir
;const char * new_name
;old_dir
a pointer to the parent dentry for the renamed object. This should be a directory dentry.
old_dentry
dentry of an object to be renamed.
new_dir
a pointer to the parent dentry where the object should be moved. This should be a directory dentry.
new_name
a pointer to a string containing the target name.
This function renames a file/directory in debugfs. The target must not exist for rename to succeed.
This function will return a pointer to old_dentry (which is updated to
reflect renaming) if it succeeds. If an error occurs, NULL
will be
returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned.
debugfs_initialized — Tells whether debugfs has been registered
bool debugfs_initialized ( | void) ; |
void
;debugfs_create_u8 — create a debugfs file that is used to read and write an unsigned 8-bit value
struct dentry * debugfs_create_u8 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u8 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
This function creates a file in debugfs with the given name that
contains the value of the variable value
. If the mode
variable is so
set, it can be read from, and written to.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned. It is not wise to check for this value, but rather, check for
NULL
or !NULL
instead, so as to eliminate the need for #ifdef in the calling
code.
debugfs_create_u16 — create a debugfs file that is used to read and write an unsigned 16-bit value
struct dentry * debugfs_create_u16 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u16 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
This function creates a file in debugfs with the given name that
contains the value of the variable value
. If the mode
variable is so
set, it can be read from, and written to.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned. It is not wise to check for this value, but rather, check for
NULL
or !NULL
instead, so as to eliminate the need for #ifdef in the calling
code.
debugfs_create_u32 — create a debugfs file that is used to read and write an unsigned 32-bit value
struct dentry * debugfs_create_u32 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u32 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
This function creates a file in debugfs with the given name that
contains the value of the variable value
. If the mode
variable is so
set, it can be read from, and written to.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned. It is not wise to check for this value, but rather, check for
NULL
or !NULL
instead, so as to eliminate the need for #ifdef in the calling
code.
debugfs_create_u64 — create a debugfs file that is used to read and write an unsigned 64-bit value
struct dentry * debugfs_create_u64 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u64 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
This function creates a file in debugfs with the given name that
contains the value of the variable value
. If the mode
variable is so
set, it can be read from, and written to.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned. It is not wise to check for this value, but rather, check for
NULL
or !NULL
instead, so as to eliminate the need for #ifdef in the calling
code.
debugfs_create_x8 — create a debugfs file that is used to read and write an unsigned 8-bit value
struct dentry * debugfs_create_x8 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u8 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
debugfs_create_x16 — create a debugfs file that is used to read and write an unsigned 16-bit value
struct dentry * debugfs_create_x16 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u16 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
debugfs_create_x32 — create a debugfs file that is used to read and write an unsigned 32-bit value
struct dentry * debugfs_create_x32 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u32 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
debugfs_create_x64 — create a debugfs file that is used to read and write an unsigned 64-bit value
struct dentry * debugfs_create_x64 ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u64 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
debugfs_create_size_t — create a debugfs file that is used to read and write a size_t value
struct dentry * debugfs_create_size_t ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;size_t * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
debugfs_create_bool — create a debugfs file that is used to read and write a boolean value
struct dentry * debugfs_create_bool ( | name, | |
mode, | ||
parent, | ||
value) ; |
const char * name
;mode_t mode
;struct dentry * parent
;u32 * value
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
value
a pointer to the variable that the file should read from and write to.
This function creates a file in debugfs with the given name that
contains the value of the variable value
. If the mode
variable is so
set, it can be read from, and written to.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned. It is not wise to check for this value, but rather, check for
NULL
or !NULL
instead, so as to eliminate the need for #ifdef in the calling
code.
debugfs_create_blob — create a debugfs file that is used to read a binary blob
struct dentry * debugfs_create_blob ( | name, | |
mode, | ||
parent, | ||
blob) ; |
const char * name
;mode_t mode
;struct dentry * parent
;struct debugfs_blob_wrapper * blob
;name
a pointer to a string containing the name of the file to create.
mode
the permission that the file should have
parent
a pointer to the parent dentry for this file. This should be a
directory dentry if set. If this parameter is NULL
, then the
file will be created in the root of the debugfs filesystem.
blob
a pointer to a struct debugfs_blob_wrapper which contains a pointer to the blob data and the size of the data.
This function creates a file in debugfs with the given name that exports
blob
->data as a binary blob. If the mode
variable is so set it can be
read from. Writing is not supported.
This function will return a pointer to a dentry if it succeeds. This
pointer must be passed to the debugfs_remove
function when the file is
to be removed (no automatic cleanup happens if your module is unloaded,
you are responsible here.) If an error occurs, NULL
will be returned.
If debugfs is not enabled in the kernel, the value -ENODEV
will be
returned. It is not wise to check for this value, but rather, check for
NULL
or !NULL
instead, so as to eliminate the need for #ifdef in the calling
code.
Table of Contents
The journalling layer is easy to use. You need to first of all create a journal_t data structure. There are two calls to do this dependent on how you decide to allocate the physical media on which the journal resides. The journal_init_inode() call is for journals stored in filesystem inodes, or the journal_init_dev() call can be used for a journal stored on a raw device (in a continuous range of blocks). A journal_t is a typedef for a struct pointer, so when you are finally finished make sure you call journal_destroy() on it to free up any used kernel memory.
Once you have got your journal_t object you need to 'mount' or load the journal file, unless of course you haven't initialised it yet - in which case you need to call journal_create().
Most of the time however your journal file will already have been created, but before you load it you must call journal_wipe() to empty the journal file. Hang on, you say, what if the filesystem wasn't cleanly umount()'d? Well, it is the job of the client file system to detect this and skip the call to journal_wipe().
In either case the next call should be to journal_load() which prepares the journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery() for you if it detects any outstanding transactions in the journal and similarly journal_load() will call journal_recover() if necessary. I would advise reading fs/ext3/super.c for examples on this stage. [RGG: Why is the journal_wipe() call necessary - doesn't this needlessly complicate the API? Or isn't it a good idea for the journal layer to hide dirty mounts from the client fs?]
Now you can go ahead and start modifying the underlying filesystem. Almost.
You still need to actually journal your filesystem changes, this is done by wrapping them into transactions. Additionally you also need to wrap the modification of each of the buffers with calls to the journal layer, so it knows what the modifications you are actually making are. To do this use journal_start() which returns a transaction handle.
journal_start() and its counterpart journal_stop(), which indicates the end of a transaction are nestable calls, so you can reenter a transaction if necessary, but remember you must call journal_stop() the same number of times as journal_start() before the transaction is completed (or more accurately leaves the update phase). Ext3/VFS makes use of this feature to simplify quota support.
Inside each transaction you need to wrap the modifications to the individual buffers (blocks). Before you start to modify a buffer you need to call journal_get_{create,write,undo}_access() as appropriate; this allows the journalling layer to copy the unmodified data if it needs to. After all, the buffer may be part of a previously uncommitted transaction. At this point you are at last ready to modify a buffer, and once you have done so you need to call journal_dirty_{meta,}data(). Or if you've asked for access to a buffer you now know is no longer required to be pushed back on the device you can call journal_forget() in much the same way as you might have used bforget() in the past.
A journal_flush() may be called at any time to commit and checkpoint all your transactions.
Then at umount time, in your put_super() (2.4) or write_super() (2.5) you can then call journal_destroy() to clean up your in-core journal object.
Unfortunately there are a couple of ways the journal layer can cause a deadlock. The first thing to note is that each task can only have a single outstanding transaction at any one time; remember nothing commits until the outermost journal_stop(). This means you must complete the transaction at the end of each file/inode/address etc. operation you perform, so that the journalling system isn't re-entered on another journal, since transactions can't be nested/batched across differing journals, and another filesystem other than yours (say ext3) may be modified in a later syscall.
The second case to bear in mind is that journal_start() can block if there isn't enough space in the journal for your transaction (based on the passed nblocks param) - when it blocks it merely(!) needs to wait for transactions to complete and be committed from other tasks, so essentially we are waiting for journal_stop(). So to avoid deadlocks you must treat journal_start/stop() as if they were semaphores and include them in your semaphore ordering rules to prevent deadlocks. Note that journal_extend() has similar blocking behaviour to journal_start() so you can deadlock here just as easily as on journal_start().
Try to reserve the right number of blocks the first time. ;-). This will be the maximum number of blocks you are going to touch in this transaction. I advise having a look at at least ext3_jbd.h to see the basis on which ext3 makes these decisions.
Another wriggle to watch out for is your on-disk block allocation strategy. Why? Because, if you undo a delete, you need to ensure you haven't reused any of the freed blocks in a later transaction. One simple way of doing this is make sure any blocks you allocate only have checkpointed transactions listed against them. Ext3 does this in ext3_test_allocatable().
Locking is also provided through journal_{un,}lock_updates(); ext3 uses this when it wants a window with a clean and stable fs for a moment, e.g.
journal_lock_updates() //stop new stuff happening.. journal_flush() // checkpoint everything. ..do stuff on stable fs journal_unlock_updates() // carry on with filesystem use.
The opportunities for abuse and DOS attacks with this should be obvious, if you allow unprivileged userspace to trigger codepaths containing these calls.
A new feature of jbd since 2.5.25 is commit callbacks. With the new journal_callback_set() function you can now ask the journalling layer to call you back when the transaction is finally committed to disk, so that you can do some of your own management. The key to this is the journal_callback struct; this maintains the internal callback information but you can extend it like this:-
struct myfs_callback_s { //Data structure element required by jbd.. struct journal_callback for_jbd; // Stuff for myfs allocated together. myfs_inode* i_commited; }
this would be useful if you needed to know when data was committed to a particular inode.
Using the journal is a matter of wrapping the different context changes, being each mount, each modification (transaction) and each changed buffer to tell the journalling layer about them.
Here is some pseudo code to give you an idea of how it works, as an example.
journal_t* my_jnrl = journal_create(); journal_init_{dev,inode}(jnrl,...) if (clean) journal_wipe(); journal_load(); foreach(transaction) { /*transactions must be completed before a syscall returns to userspace*/ handle_t * xct=journal_start(my_jnrl); foreach(bh) { journal_get_{create,write,undo}_access(xact,bh); if ( myfs_modify(bh) ) { /* returns true if makes changes */ journal_dirty_{meta,}data(xact,bh); } else { journal_forget(bh); } } journal_stop(xct); } journal_destroy(my_jrnl);
The journalling layer uses typedefs to 'hide' the concrete definitions of the structures used. As a client of the JBD layer you can just rely on using the pointer as a magic cookie of some sort. Obviously the hiding is not enforced as this is 'C'.
typedef handle_t — The handle_t type represents a single atomic update being performed by some process.
typedef handle_t;
All filesystem modifications made by the process go through this handle. Recursive operations (such as quota operations) are gathered into a single update.
The buffer credits field is used to account for journaled buffers being modified by the running process. To ensure that there is enough log space for all outstanding operations, we need to limit the number of outstanding buffers possible at any time. When the operation completes, any buffer credits not used are credited back to the transaction, so that at all times we know how many buffers the outstanding updates on a transaction might possibly touch.
This is an opaque datatype.
typedef journal_t — The journal_t maintains all of the journaling state information for a single filesystem.
typedef journal_t;
struct handle_s — this is the concrete type associated with handle_t.
struct handle_s { transaction_t * h_transaction; int h_buffer_credits; int h_ref; int h_err; unsigned int h_sync:1; unsigned int h_jdata:1; unsigned int h_aborted:1; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map h_lockdep_map; #endif };
Which compound transaction is this update a part of?
Number of remaining buffers we are allowed to dirty.
Reference count on this handle
Field for caller's use to track errors through large fs operations
flag for sync-on-close
flag to force data journaling
flag indicating fatal error on handle
lockdep info for debugging lock problems
struct journal_s — this is the concrete type associated with journal_t.
struct journal_s { unsigned long j_flags; int j_errno; struct buffer_head * j_sb_buffer; journal_superblock_t * j_superblock; int j_format_version; spinlock_t j_state_lock; int j_barrier_count; struct mutex j_barrier; transaction_t * j_running_transaction; transaction_t * j_committing_transaction; transaction_t * j_checkpoint_transactions; wait_queue_head_t j_wait_transaction_locked; wait_queue_head_t j_wait_logspace; wait_queue_head_t j_wait_done_commit; wait_queue_head_t j_wait_checkpoint; wait_queue_head_t j_wait_commit; wait_queue_head_t j_wait_updates; struct mutex j_checkpoint_mutex; unsigned int j_head; unsigned int j_tail; unsigned int j_free; unsigned int j_first; unsigned int j_last; struct block_device * j_dev; int j_blocksize; unsigned int j_blk_offset; struct block_device * j_fs_dev; unsigned int j_maxlen; spinlock_t j_list_lock; struct inode * j_inode; tid_t j_tail_sequence; tid_t j_transaction_sequence; tid_t j_commit_sequence; tid_t j_commit_request; __u8 j_uuid[16]; struct task_struct * j_task; int j_max_transaction_buffers; unsigned long j_commit_interval; struct timer_list j_commit_timer; spinlock_t j_revoke_lock; struct jbd_revoke_table_s * j_revoke; struct jbd_revoke_table_s * j_revoke_table[2]; struct buffer_head ** j_wbuf; int j_wbufsize; pid_t j_last_sync_writer; u64 j_average_commit_time; void * j_private; };
General journaling state flags
Is there an outstanding uncleared error on the journal (from a prior abort)?
First part of superblock buffer
Second part of superblock buffer
Version of the superblock format
Protect the various scalars in the journal
Number of processes waiting to create a barrier lock
The barrier lock itself
The current running transaction..
the transaction we are pushing to disk
a linked circular list of all transactions waiting for checkpointing
Wait queue for waiting for a locked transaction to start committing, or for a barrier lock to be released
Wait queue for waiting for checkpointing to complete
Wait queue for waiting for commit to complete
Wait queue to trigger checkpointing
Wait queue to trigger commit
Wait queue to wait for updates to complete
Mutex for locking against concurrent checkpoints
Journal head - identifies the first unused block in the journal
Journal tail - identifies the oldest still-used block in the journal.
Journal free - how many free blocks are there in the journal?
The block number of the first usable block
The block number one beyond the last usable block
Device where we store the journal
blocksize for the location where we store the journal.
starting block offset into the device where we store the journal
Device which holds the client fs. For internal journal this will be equal to j_dev
Total maximum capacity of the journal region on disk.
Protects the buffer lists and internal buffer state.
Optional inode where we store the journal. If present, all journal
block numbers are mapped into this inode via bmap
.
Sequence number of the oldest transaction in the log
Sequence number of the next transaction to grant
Sequence number of the most recently committed transaction
Sequence number of the most recent transaction wanting commit
Uuid of client object.
Pointer to the current commit thread for this journal
Maximum number of metadata buffers to allow in a single compound commit transaction
What is the maximum transaction lifetime before we begin a commit?
The timer used to wakeup the commit thread
Protect the revoke table
The revoke table - maintains the list of revoked blocks in the current transaction.
alternate revoke tables for j_revoke
array of buffer_heads for journal_commit_transaction
maximum number of buffer_heads allowed in j_wbuf, the number that will fit in j_blocksize
most recent pid which did a synchronous write
the average amount of time in nanoseconds it takes to commit a transaction to the disk.
An opaque pointer to fs-private information.
The functions here are split into two groups: those that affect a journal as a whole, and those which are used to manage transactions.
journal_init_dev — creates and initialises a journal structure
journal_t * journal_init_dev ( | bdev, | |
fs_dev, | ||
start, | ||
len, | ||
blocksize) ; |
struct block_device * bdev
;struct block_device * fs_dev
;int start
;int len
;int blocksize
;journal_init_inode — creates a journal which maps to an inode.
journal_t * journal_init_inode ( | inode) ; |
struct inode * inode
;journal_create — Initialise the new journal file
int journal_create ( | journal) ; |
journal_t * journal
;journal_destroy — Release a journal_t structure.
int journal_destroy ( | journal) ; |
journal_t * journal
;journal_check_used_features — Check if features specified are used.
int journal_check_used_features ( | journal, | |
compat, | ||
ro, | ||
incompat) ; |
journal_t * journal
;unsigned long compat
;unsigned long ro
;unsigned long incompat
;journal_check_available_features — Check feature set in journalling layer
int journal_check_available_features ( | journal, | |
compat, | ||
ro, | ||
incompat) ; |
journal_t * journal
;unsigned long compat
;unsigned long ro
;unsigned long incompat
;journal_set_features — Mark a given journal feature in the superblock
int journal_set_features ( | journal, | |
compat, | ||
ro, | ||
incompat) ; |
journal_t * journal
;unsigned long compat
;unsigned long ro
;unsigned long incompat
;journal_update_format — Update on-disk journal structure.
int journal_update_format ( | journal) ; |
journal_t * journal
;journal_wipe — Wipe journal contents
int journal_wipe ( | journal, | |
write) ; |
journal_t * journal
;int write
;
Wipe out all of the contents of a journal, safely. This will produce
a warning if the journal contains any valid recovery information.
Must be called between journal_init_*() and journal_load
.
If 'write' is non-zero, then we wipe out the journal on disk; otherwise we merely suppress recovery.
journal_abort — Shutdown the journal immediately.
void journal_abort ( | journal, | |
errno) ; |
journal_t * journal
;int errno
;journal
the journal to shutdown.
errno
an error number to record in the journal indicating the reason for the shutdown.
Perform a complete, immediate shutdown of the ENTIRE journal (not of a single transaction). This operation cannot be undone without closing and reopening the journal.
The journal_abort function is intended to support higher level error recovery mechanisms such as the ext2/ext3 remount-readonly error mode.
Journal abort has very specific semantics. Any existing dirty, unjournaled buffers in the main filesystem will still be written to disk by bdflush, but the journaling mechanism will be suspended immediately and no further transaction commits will be honoured.
Any dirty, journaled buffers will be written back to disk without hitting the journal. Atomicity cannot be guaranteed on an aborted filesystem, but we _do_ attempt to leave as much data as possible behind for fsck to use for cleanup.
Any attempt to get a new transaction handle on a journal which is in ABORT state will just result in an -EROFS error return. A journal_stop on an existing handle will return -EIO if we have entered abort state during the update.
Recursive transactions are not disturbed by journal abort until the final journal_stop, which will receive the -EIO error.
Finally, the journal_abort call allows the caller to supply an errno which will be recorded (if possible) in the journal superblock. This allows a client to record failure conditions in the middle of a transaction without having to complete the transaction to record the failure to disk. ext3_error, for example, now uses this functionality.
Errors which originate from within the journaling layer will NOT supply an errno; a null errno implies that absolutely no further writes are done to the journal (unless there are any already in progress).
journal_errno — returns the journal's error state.
int journal_errno ( | journal) ; |
journal_t * journal
;journal_clear_err — clears the journal's error state
int journal_clear_err ( | journal) ; |
journal_t * journal
;journal_recover — recovers an on-disk journal
int journal_recover ( | journal) ; |
journal_t * journal
;The primary function for recovering the log contents when mounting a journaled device.
Recovery is done in three passes. In the first pass, we look for the end of the log. In the second, we assemble the list of revoke blocks. In the third and final pass, we replay any un-revoked blocks in the log.
journal_skip_recovery — Start journal and wipe existing records
int journal_skip_recovery ( | journal) ; |
journal_t * journal
;Locate any valid recovery information from the journal and set up the journal structures in memory to ignore it (presumably because the caller has evidence that it is out of date). This function doesn't appear to be exported.
We perform one pass over the journal to allow us to tell the user how much recovery information is being erased, and to let us initialise the journal transaction sequence numbers to the next unused ID.
journal_start — Obtain a new handle.
handle_t * journal_start ( | journal, | |
nblocks) ; |
journal_t * journal
;int nblocks
;We make sure that the transaction can guarantee at least nblocks of modified buffers in the log. We block until the log can guarantee that much space.
This function is visible to journal users (like ext3fs), so is not called with the journal already locked.
Return a pointer to a newly allocated handle, or an ERR_PTR
value
on failure.
journal_extend — extend buffer credits.
int journal_extend ( | handle, | |
nblocks) ; |
handle_t * handle
;int nblocks
;Some transactions, such as large extends and truncates, can be done atomically all at once or in several stages. The operation requests a credit for a number of buffer modifications in advance, but can extend its credit if it needs more.
journal_extend tries to give the running handle more buffer credits. It does not guarantee that allocation - this is a best-effort only. The calling process MUST be able to deal cleanly with a failure to extend here.
Return 0 on success, non-zero on failure.
A return code < 0 implies an error; a return code > 0 implies normal transaction-full status.
journal_restart — restart a handle.
int journal_restart ( | handle, | |
nblocks) ; |
handle_t * handle
;int nblocks
;Restart a handle for a multi-transaction filesystem operation.
If the journal_extend
call above fails to grant new buffer credits
to a running handle, a call to journal_restart will commit the
handle's transaction so far and reattach the handle to a new
transaction capable of guaranteeing the requested number of
credits.
journal_lock_updates — establish a transaction barrier.
void journal_lock_updates ( | journal) ; |
journal_t * journal
;journal_unlock_updates — release barrier
void journal_unlock_updates ( | journal) ; |
journal_t * journal
;journal_get_write_access — notify intent to modify a buffer for metadata (not data) update.
int journal_get_write_access ( | handle, | |
bh) ; |
handle_t * handle
;struct buffer_head * bh
;journal_get_create_access — notify intent to use newly created bh
int journal_get_create_access ( | handle, | |
bh) ; |
handle_t * handle
;struct buffer_head * bh
;journal_get_undo_access — Notify intent to modify metadata with non-rewindable consequences
int journal_get_undo_access ( | handle, | |
bh) ; |
handle_t * handle
;struct buffer_head * bh
;Sometimes there is a need to distinguish between metadata which has been committed to disk and that which has not. The ext3fs code uses this for freeing and allocating space, we have to make sure that we do not reuse freed space until the deallocation has been committed, since if we overwrote that space we would make the delete un-rewindable in case of a crash.
To deal with that, journal_get_undo_access requests write access to a buffer for parts of non-rewindable operations such as delete operations on the bitmaps. The journaling code must keep a copy of the buffer's contents prior to the undo_access call until such time as we know that the buffer has definitely been committed to disk.
We never need to know which transaction the committed data is part of, buffers touched here are guaranteed to be dirtied later and so will be committed to a new transaction in due course, at which point we can discard the old committed data pointer.
Returns error number or 0 on success.
journal_dirty_data — mark a buffer as containing dirty data to be flushed
int journal_dirty_data ( | handle, | |
bh) ; |
handle_t * handle
;struct buffer_head * bh
;Mark a buffer as containing dirty data which needs to be flushed before we can commit the current transaction.
The buffer is placed on the transaction's data list and is marked as belonging to the transaction.
Returns error number or 0 on success.
journal_dirty_data
can be called via page_launder->ext3_writepage
by kswapd.
journal_dirty_metadata — mark a buffer as containing dirty metadata
int journal_dirty_metadata ( | handle, | |
bh) ; |
handle_t * handle
;struct buffer_head * bh
;Mark dirty metadata which needs to be journaled as part of the current transaction.
The buffer is placed on the transaction's metadata list and is marked as belonging to the transaction.
Returns error number or 0 on success.
Special care needs to be taken if the buffer already belongs to the current committing transaction (in which case we should have frozen data present for that commit). In that case, we don't relink the buffer: that only gets done when the old transaction finally completes its commit.
journal_forget —
bforget
for potentially-journaled buffers.
int journal_forget ( | handle, | |
bh) ; |
handle_t * handle
;struct buffer_head * bh
;We can only do the bforget if there are no commits pending against the buffer. If the buffer is dirty in the current running transaction we can safely unlink it.
bh may not be a journalled buffer at all - it may be a non-JBD buffer which came off the hashtable. Check for this.
Decrements bh->b_count by one.
Allow this call even if the handle has aborted --- it may be part of the caller's cleanup after an abort.
journal_stop — complete a transaction
int journal_stop ( | handle) ; |
handle_t * handle
;All done for a particular handle.
There is not much action needed here. We just return any remaining buffer credits to the transaction and remove the handle. The only complication is that we need to start a commit operation if the filesystem is marked for synchronous update.
journal_stop itself will not usually return an error, but it may do so in unusual circumstances. In particular, expect it to return -EIO if a journal_abort has been executed since the transaction began.
journal_force_commit — force any uncommitted transactions
int journal_force_commit ( | journal) ; |
journal_t * journal
;journal_try_to_free_buffers — try to free page buffers.
int journal_try_to_free_buffers ( | journal, | |
page, | ||
gfp_mask) ; |
journal_t * journal
;struct page * page
;gfp_t gfp_mask
;journal
journal for operation
page
to try and free
gfp_mask
we use the mask to detect how hard we should try to release buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to release the buffers.
For all the buffers on this page,
if they are fully written out ordered data, move them onto BUF_CLEAN
so try_to_free_buffers
can reap them.
This function returns non-zero if we wish try_to_free_buffers
to be called. We do this if the page is releasable by try_to_free_buffers
.
We also do it if the page has locked or dirty buffers and the caller wants
us to perform sync or async writeout.
This complicates JBD locking somewhat. We aren't protected by the BKL here. We wish to remove the buffer from its committing or running transaction's ->t_datalist via __journal_unfile_buffer.
This may *change* the value of transaction_t->t_datalist, so anyone who looks at t_datalist needs to lock against this function.
Even worse, someone may be doing a journal_dirty_data on this
buffer. So we need to lock against that. journal_dirty_data
will come out of the lock with the buffer dirty, which makes it
ineligible for release here.
Who else is affected by this? hmm... Really the only contender
is do_get_write_access
- it could be looking at the buffer while
journal_try_to_free_buffers
is changing its state. But that
cannot happen because we never reallocate freed data as metadata
while the data is part of a transaction. Yes?
Return 0 on failure, 1 on success
Table of Contents
splice is a method for moving blocks of data around inside the kernel, without continually transferring them between the kernel and user space.
splice_to_pipe — fill passed data into a pipe
ssize_t splice_to_pipe ( | pipe, | |
spd) ; |
struct pipe_inode_info * pipe
;struct splice_pipe_desc * spd
;generic_file_splice_read — splice data from file to a pipe
ssize_t generic_file_splice_read ( | in, | |
ppos, | ||
pipe, | ||
len, | ||
flags) ; |
struct file * in
;loff_t * ppos
;struct pipe_inode_info * pipe
;size_t len
;unsigned int flags
;splice_from_pipe_feed — feed available data from a pipe to a file
int splice_from_pipe_feed ( | pipe, | |
sd, | ||
actor) ; |
struct pipe_inode_info * pipe
;struct splice_desc * sd
;splice_actor * actor
;
This function loops over the pipe and calls actor
to do the
actual moving of a single struct pipe_buffer to the desired
destination. It returns when there's no more buffers left in
the pipe or if the requested number of bytes (sd
->total_len)
have been copied. It returns a positive number (one) if the
pipe needs to be filled with more data, zero if the required
number of bytes have been copied and -errno on error.
This, together with splice_from_pipe_{begin,end,next}, may be
used to implement the functionality of __splice_from_pipe
when
locking is required around copying the pipe buffers to the
destination.
splice_from_pipe_next — wait for some data to splice from
int splice_from_pipe_next ( | pipe, | |
sd) ; |
struct pipe_inode_info * pipe
;struct splice_desc * sd
;splice_from_pipe_begin — start splicing from pipe
void splice_from_pipe_begin ( | sd) ; |
struct splice_desc * sd
;splice_from_pipe_end — finish splicing from pipe
void splice_from_pipe_end ( | pipe, | |
sd) ; |
struct pipe_inode_info * pipe
;struct splice_desc * sd
;__splice_from_pipe — splice data from a pipe to given actor
ssize_t __splice_from_pipe ( | pipe, | |
sd, | ||
actor) ; |
struct pipe_inode_info * pipe
;struct splice_desc * sd
;splice_actor * actor
;splice_from_pipe — splice data from a pipe to a file
ssize_t splice_from_pipe ( | pipe, | |
out, | ||
ppos, | ||
len, | ||
flags, | ||
actor) ; |
struct pipe_inode_info * pipe
;struct file * out
;loff_t * ppos
;size_t len
;unsigned int flags
;splice_actor * actor
;generic_file_splice_write — splice data from a pipe to a file
ssize_t generic_file_splice_write ( | pipe, | |
out, | ||
ppos, | ||
len, | ||
flags) ; |
struct pipe_inode_info * pipe
;struct file * out
;loff_t * ppos
;size_t len
;unsigned int flags
;generic_splice_sendpage — splice data from a pipe to a socket
ssize_t generic_splice_sendpage ( | pipe, | |
out, | ||
ppos, | ||
len, | ||
flags) ; |
struct pipe_inode_info * pipe
;struct file * out
;loff_t * ppos
;size_t len
;unsigned int flags
;splice_direct_to_actor — splices data directly between two non-pipes
ssize_t splice_direct_to_actor ( | in, | |
sd, | ||
actor) ; |
struct file * in
;struct splice_desc * sd
;splice_direct_actor * actor
;do_splice_direct — splices data directly between two files
long do_splice_direct ( | in, | |
ppos, | ||
out, | ||
len, | ||
flags) ; |
struct file * in
;loff_t * ppos
;struct file * out
;size_t len
;unsigned int flags
;Table of Contents
Pipe interfaces are all for in-kernel (builtin image) use. They are not exported for use by modules.
struct pipe_buffer — a linux kernel pipe buffer
struct pipe_buffer { struct page * page; unsigned int offset; unsigned int len; const struct pipe_buf_operations * ops; unsigned int flags; unsigned long private; };
struct pipe_inode_info — a linux kernel pipe
struct pipe_inode_info { wait_queue_head_t wait; unsigned int nrbufs; unsigned int curbuf; unsigned int buffers; unsigned int readers; unsigned int writers; unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; struct page * tmp_page; struct fasync_struct * fasync_readers; struct fasync_struct * fasync_writers; struct inode * inode; struct pipe_buffer * bufs; };
reader/writer wait point in case of empty/full pipe
the number of non-empty pipe buffers in this pipe
the current pipe buffer entry
total number of buffers (should be a power of 2)
number of current readers of this pipe
number of current writers of this pipe
number of writers blocked waiting for room
reader counter
writer counter
cached released page
reader side fasync
writer side fasync
inode this pipe is attached to
the circular array of pipe buffers
generic_pipe_buf_map — virtually map a pipe buffer
void * generic_pipe_buf_map ( | pipe, | |
buf, | ||
atomic) ; |
struct pipe_inode_info * pipe
;struct pipe_buffer * buf
;int atomic
;generic_pipe_buf_unmap — unmap a previously mapped pipe buffer
void generic_pipe_buf_unmap ( | pipe, | |
buf, | ||
map_data) ; |
struct pipe_inode_info * pipe
;struct pipe_buffer * buf
;void * map_data
;generic_pipe_buf_steal — attempt to take ownership of a pipe_buffer
int generic_pipe_buf_steal ( | pipe, | |
buf) ; |
struct pipe_inode_info * pipe
;struct pipe_buffer * buf
;generic_pipe_buf_get — get a reference to a struct pipe_buffer
void generic_pipe_buf_get ( | pipe, | |
buf) ; |
struct pipe_inode_info * pipe
;struct pipe_buffer * buf
;generic_pipe_buf_confirm — verify contents of the pipe buffer
int generic_pipe_buf_confirm ( | info, | |
buf) ; |
struct pipe_inode_info * info
;struct pipe_buffer * buf
;