<linux kernel> 3.2 pdflush change

sync_inode - write an inode and its pages to disk.

* @inode: the inode to sync

This function is responsible for synchronizing all dirty inodes belonging to a given superblock.

/*
 * Create the kernel thread that runs bdi_writeback_thread() with &bdi->wb as
 * its argument; the thread name is built from "flush-%s" and the device name.
 */
task = kthread_create(bdi_writeback_thread, &bdi->wb,

"flush-%s", dev_name(bdi->dev));

root@szx3:/home/szx# ps -ef|grep flush

root 950 2 0 08:55 ? 00:00:00 [flush-8:0]

bdi_writeback_thread

/*
 * Writeback state attached to a backing device: a back-pointer to the owning
 * bdi, the writeback thread and its delayed-wakeup timer, activity timestamps,
 * and the three inode lists (b_dirty, b_io, b_more_io) guarded by list_lock.
 */
struct bdi_writeback {

struct backing_dev_info *bdi; /* our parent bdi */

unsigned int nr;

unsigned long last_old_flush; /* last old data flush */

unsigned long last_active; /* last time bdi thread was active */

struct task_struct *task; /* writeback thread */

struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */

struct list_head b_dirty; /* dirty inodes */

struct list_head b_io; /* parked for writeback */

struct list_head b_more_io; /* parked for more writeback */

spinlock_t list_lock; /* protects the b_* lists */

};

struct backing_dev_info {

struct list_head bdi_list;

unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */

unsigned long state; /* Always use atomic bitops on this */

unsigned int capabilities; /* Device capabilities */

congested_fn *congested_fn; /* Function pointer if device is md/dm */

void *congested_data; /* Pointer to aux data for congested func */

char *name;

struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

unsigned long bw_time_stamp; /* last time write bw is updated */

unsigned long dirtied_stamp;

unsigned long written_stamp; /* pages written at bw_time_stamp */

unsigned long write_bandwidth; /* the estimated write bandwidth */

unsigned long avg_write_bandwidth; /* further smoothed write bw */

* The base dirty throttle rate, re-calculated on every 200ms.

* All the bdi tasks' dirty rate will be curbed under it.

* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit

* in small steps and is much more smooth/stable than the latter.

unsigned long dirty_ratelimit;

unsigned long balanced_dirty_ratelimit;

struct prop_local_percpu completions;

int dirty_exceeded;

unsigned int min_ratio;

unsigned int max_ratio, max_prop_frac;

struct bdi_writeback wb; /* default writeback info for this bdi */

spinlock_t wb_lock; /* protects work_list */

struct list_head work_list;

struct device *dev;

struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS

struct dentry *debug_dir;

struct dentry *debug_stats;

#endif

};

* Passed into wb_writeback(), essentially a subset of writeback_control

struct wb_writeback_work {

long nr_pages;

struct super_block *sb;

unsigned long *older_than_this;

enum writeback_sync_modes sync_mode;

unsigned int tagged_writepages:1;

unsigned int for_kupdate:1;

unsigned int range_cyclic:1;

unsigned int for_background:1;

enum wb_reason reason; /* why was writeback initiated? */

struct list_head list; /* pending work list */

struct completion *done; /* set if the caller waits */

};

long wb_do_writeback(struct bdi_writeback *wb, int force_wait)

{

struct backing_dev_info *bdi = wb->bdi;

struct wb_writeback_work *work;

long wrote = 0;

set_bit(BDI_writeback_running, &wb->bdi->state);

while ((work = get_next_work_item(bdi)) != NULL) {

* Override sync mode, in case we must wait for completion

* because this thread is exiting now.

if (force_wait)

work->sync_mode = WB_SYNC_ALL;

trace_writeback_exec(bdi, work);

wrote += wb_writeback(wb, work);

* Notify the caller of completion if this is a synchronous

* work item, otherwise just free it.

if (work->done)

complete(work->done);

else

kfree(work);

}

* Check for periodic writeback, kupdated() style

wrote += wb_check_old_data_flush(wb);

wrote += wb_check_background_flush(wb);

clear_bit(BDI_writeback_running, &wb->bdi->state);

return wrote;

}

The first time one of an inode's pages is dirtied, we mark the dirtying time in the inode's address_space. The periodic writeback code then just walks the superblock inode list, writing back any inodes that are older than a specific point in time.

__wait_on_bit

/*
 * bit_waitqueue - look up the hashed wait queue head for a bit in a word
 * @word: address of the word that contains the bit
 * @bit: bit number within @word
 *
 * The word's address and the bit number are folded into a single key, the
 * key is hashed with hash_long(), and the hash indexes the wait_table of the
 * zone that holds the page backing @word.
 */
wait_queue_head_t *bit_waitqueue(void *word, int bit)
{
	const struct zone *zone = page_zone(virt_to_page(word));
	/* Room for the bit index: 5 bits for 32-bit longs, 6 bits otherwise. */
	const int shift = (BITS_PER_LONG == 32) ? 5 : 6;
	unsigned long key = ((unsigned long)word << shift) | bit;
	unsigned long slot = hash_long(key, zone->wait_table_bits);

	return &zone->wait_table[slot];
}

Zone -> wait_table

Zone -> wait_table_bits

Power-of-2 order of the size of the wait queue hash table array

The purpose of all these is to keep track of the people waiting for a page to become available and make them runnable again when possible. The trouble is that this consumes a lot of space, especially when so few things wait on pages at a given time.

So instead of using per-page wait queues, we use a waitqueue hash table.

BSF - Bit Scan Forward (386+)

Usage: BSF dest,src

Modifies flags: ZF

Scans the source operand for the first set bit. If a set bit is found, ZF is cleared and the destination is loaded with the index of that bit; if the source operand is zero, ZF is set and the destination is left undefined. BSF scans forward across the bit pattern (0-n) while BSR scans in reverse (n-0).

/*
 * zone_wait_table_init - size the hashed wait-queue table of a zone
 * @zone: zone whose wait table parameters are being set up
 * @zone_size_pages: size of the zone in pages, used to pick the entry count
 *
 * Stores the number of hash entries and the matching bit count in @zone.
 * This listing is an excerpt: the rest of the function body is elided.
 */
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
	int i;
	struct pglist_data *pgdat = zone->zone_pgdat;
	size_t alloc_size;

	/*
	 * The per-page waitqueue mechanism uses hashed waitqueues
	 * per zone.
	 */
	zone->wait_table_hash_nr_entries =
		 wait_table_hash_nr_entries(zone_size_pages);
	zone->wait_table_bits =
		wait_table_bits(zone->wait_table_hash_nr_entries);

	//....
}

从低位开始搜索

/*
 * __ffs - find the first (least-significant) set bit in a word
 * @word: the word to search
 *
 * Executes the x86 BSF instruction with @word as both input and output and
 * returns the resulting value, i.e. the index of the lowest set bit.
 * NOTE(review): BSF leaves its destination undefined for a zero source, so
 * callers presumably have to check word != 0 first -- confirm at call sites.
 */
static inline unsigned long __ffs(unsigned long word)

{

asm("bsf %1,%0"

: "=r" (word)

: "rm" (word));

return word;

}

/*
 * inode_wait_for_writeback - sleep until I_SYNC is cleared on an inode
 * @inode: inode to wait on
 * @wb: writeback context whose list_lock is dropped while sleeping
 *
 * Both inode->i_lock and wb->list_lock are released before each sleep and
 * re-acquired afterwards (list_lock first, then i_lock), so the function
 * returns with the same locks held as on entry.
 */
static void inode_wait_for_writeback(struct inode *inode,
				     struct bdi_writeback *wb)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	/* Re-check under the locks: I_SYNC may be set again after a wakeup. */
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
	}
}

The kernel can start to synchronize data from various different places, but all paths save one end up in

[sync_sb_inodes]. This function is responsible for synchronizing all dirty inodes belonging to a given superblock;

[writeback_single_inode] is used for each inode.

static int

writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,

struct writeback_control *wbc)

{

struct address_space *mapping = inode->i_mapping;

long nr_to_write = wbc->nr_to_write;

unsigned dirty;

int ret;

assert_spin_locked(&wb->list_lock);

assert_spin_locked(&inode->i_lock);

if (!atomic_read(&inode->i_count))

WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));

else

WARN_ON(inode->i_state & I_WILL_FREE);

if (inode->i_state & I_SYNC) {

* If this inode is locked for writeback and we are not doing

* writeback-for-data-integrity, move it to b_more_io so that

* writeback can proceed with the other inodes on s_io.

* We'll have another go at writing back this inode when we

* completed a full scan of b_io.

if (wbc->sync_mode != WB_SYNC_ALL) {

requeue_io(inode, wb);

trace_writeback_single_inode_requeue(inode, wbc,

nr_to_write);

return 0;

}

* It's a data-integrity sync. We must wait.

inode_wait_for_writeback(inode, wb);

}

BUG_ON(inode->i_state & I_SYNC);

/* Set I_SYNC, reset I_DIRTY_PAGES */

inode->i_state |= I_SYNC;

inode->i_state &= ~I_DIRTY_PAGES;

spin_unlock(&inode->i_lock);

spin_unlock(&wb->list_lock);

ret = do_writepages(mapping, wbc);

* Make sure to wait on the data before writing out the metadata.

* This is important for filesystems that modify metadata on data

* I/O completion.

if (wbc->sync_mode == WB_SYNC_ALL) {

int err = filemap_fdatawait(mapping);

if (ret == 0)

ret = err;

}

* Some filesystems may redirty the inode during the writeback

* due to delalloc, clear dirty metadata flags right before

* write_inode()

spin_lock(&inode->i_lock);

dirty = inode->i_state & I_DIRTY;

inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);

spin_unlock(&inode->i_lock);

/* Don't write the inode if only I_DIRTY_PAGES was set */

if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {

int err = write_inode(inode, wbc);

if (ret == 0)

ret = err;

}

spin_lock(&wb->list_lock);

spin_lock(&inode->i_lock);

inode->i_state &= ~I_SYNC;

if (!(inode->i_state & I_FREEING)) {

* Sync livelock prevention. Each inode is tagged and synced in

* one shot. If still dirty, it will be redirty_tail()'ed below.

* Update the dirty time to prevent enqueue and sync it again.

if ((inode->i_state & I_DIRTY) &&

(wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))

inode->dirtied_when = jiffies;

if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {

* We didn't write back all the pages. nfs_writepages()

* sometimes bales out without doing anything.

inode->i_state |= I_DIRTY_PAGES;

if (wbc->nr_to_write <= 0) {

* slice used up: queue for next turn

requeue_io(inode, wb);

} else {

* Writeback blocked by something other than

* congestion. Delay the inode for some time to

* avoid spinning on the CPU (100% iowait)

* retrying writeback of the dirty page/inode

* that cannot be performed immediately.

redirty_tail(inode, wb);

}

} else if (inode->i_state & I_DIRTY) {

* Filesystems can dirty the inode during writeback

* operations, such as delayed allocation during

* submission or metadata updates after data IO

* completion.

redirty_tail(inode, wb);

} else {

* The inode is clean. At this point we either have

* a reference to the inode or it's on it's way out.

* No need to add it back to the LRU.

list_del_init(&inode->i_wb_list);

}

inode_sync_complete(inode);

trace_writeback_single_inode(inode, wbc, nr_to_write);

return ret;

}

<linux kernel> 3.2 pdflush change

猜你喜欢