實現過程其實和寫一個驅動很像,只需調用文件系統提供的部分接口即可。
- start()函數完成讀數據前的一些預先操作,通常如加鎖,定位數據記錄位置等,該函數返回值就是show()函數第二個參數
- show()函數實現讀數據過程,將要輸出的數據直接用seq_printf()函數打印到seq流緩衝區中,再由seq_file層的seq_read()函數將緩衝區內容拷貝到用戶空間
轉載一張圖片,內容是關於proc和seq_file之間的基本聯繫。
============
fs/jbd2/journal.c
@@ -1050,12 +1050,14 @@ static void jbd2_stats_proc_init(journal_t *journal)
if (journal->j_proc_entry) {
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
&jbd2_seq_info_fops, journal);
+ jbd2_mon_proc_init(journal->j_proc_entry, journal);
}
}
static void jbd2_stats_proc_exit(journal_t *journal)
{
remove_proc_entry("info", journal->j_proc_entry);
+ jbd2_mon_proc_exit(journal->j_proc_entry);
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
@@ -1139,6 +1141,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
struct buffer_head *bh;
char *p;
int n;
+ int i = 0, j = 0;
if (!journal)
return NULL;
@@ -1172,11 +1175,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
}
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
+ for (i = 0; i < MAX_JBD2_MONITOR_BUFFER; i++) {
+ journal->j_monitor_buffer[i] = kmalloc(sizeof(struct jbd2_delay_stat), GFP_KERNEL);
+ if (!journal->j_monitor_buffer[i]) {
+ printk(KERN_ERR "%s: Cannot allocate buffer for memory\n",
+ __func__);
+ goto out_err;
+ }
+ jbd2_mon_buffer_init(journal->j_monitor_buffer[i]);
+ }
return journal;
out_err:
kfree(journal->j_wbuf);
jbd2_stats_proc_exit(journal);
+ for (j = 0; j < i; j++)
+ kfree(journal->j_monitor_buffer[j]);
kfree(journal);
return NULL;
}
@@ -1197,6 +1211,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
int err;
int n;
unsigned long long blocknr;
+ int i = 0, j = 0;
if (!journal)
return NULL;
@@ -1246,10 +1261,21 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
}
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
+ for (i = 0; i < MAX_JBD2_MONITOR_BUFFER; i++) {
+ journal->j_monitor_buffer[i] = kmalloc(sizeof(struct jbd2_delay_stat), GFP_KERNEL);
+ if (!journal->j_monitor_buffer[i]) {
+ printk(KERN_ERR "%s: Cannot allocate buffer for memory\n",
+ __func__);
+ goto out_err;
+ }
+ jbd2_mon_buffer_init(journal->j_monitor_buffer[i]);
+ }
return journal;
out_err:
kfree(journal->j_wbuf);
+ for (j = 0; j < i; j++)
+ kfree(journal->j_monitor_buffer[j]);
jbd2_stats_proc_exit(journal);
kfree(journal);
return NULL;
@@ -1677,6 +1703,7 @@ recovery_error:
int jbd2_journal_destroy(journal_t *journal)
{
int err = 0;
+ int i;
/* Wait for the commit thread to wake up and die. */
journal_kill_thread(journal);
@@ -1720,6 +1747,8 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_chksum_driver)
crypto_free_shash(journal->j_chksum_driver);
kfree(journal->j_wbuf);
+ for (i = 0; i < MAX_JBD2_MONITOR_BUFFER; i++)
+ kfree(journal->j_monitor_buffer[i]);
kfree(journal);
return err;
============
fs/jbd2/transaction.c
@@ -29,6 +29,9 @@
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <trace/events/jbd2.h>
@@ -181,6 +184,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
transaction_t *t = journal->j_running_transaction;
int needed;
int total = blocks + rsv_blocks;
+ u64 calltime, time_monitor, rettime;
/*
* If the current transaction is locked down for commit, wait
@@ -220,12 +224,17 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* in the new transaction.
*/
if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ calltime = sched_clock();
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
write_lock(&journal->j_state_lock);
if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
+ rettime = sched_clock();
+ time_monitor = rettime - calltime;
+ /* We're going to collect some data to monitor */
+ jbd2_mon_collect_atc_stat(journal, time_monitor);
return 1;
}
@@ -786,11 +795,14 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
char *frozen_buffer = NULL;
int need_copy = 0;
unsigned long start_lock, time_lock;
+ u64 calltime, time_monitor, rettime;
if (is_handle_aborted(handle))
return -EROFS;
journal = transaction->t_journal;
+ calltime = sched_clock();
+
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
JBUFFER_TRACE(jh, "entry");
@@ -993,6 +1005,13 @@ done:
jbd2_journal_cancel_revoke(handle, jh);
out:
+ /* Compute the time-delay */
+ rettime = sched_clock();
+ time_monitor = rettime - calltime;
+ /* This is the trace point for systemtap scripts */
+ trace_jbd2_write_access_delay(jh2bh(jh), current, time_monitor);
+ /* We're going to collect some data to monitor */
+ jbd2_mon_collect_dgwa_stat(journal, jh2bh(jh), current, time_monitor);
if (unlikely(frozen_buffer)) /* It's usually NULL */
jbd2_free(frozen_buffer, bh->b_size);
============
include/linux/jbd2.h
@@ -684,6 +684,26 @@ jbd2_time_diff(unsigned long start, unsigned long end)
#define JBD2_NR_BATCH 64
+/* Delay statistics for one monitored jbd2 function (see journal_s.j_monitor_buffer) */
+struct jbd2_delay_stat {
+#define MAX_NUM_LEVEL 9 /* number of histogram buckets in count[] */
+ dev_t dev; /* device of the last buffer recorded by jbd2_mon_buffer_process_init() */
+ pid_t pid; /* pid of the last task recorded by jbd2_mon_buffer_process_init() */
+ long long time_delay; /* last delay value recorded by jbd2_mon_buffer_process_init() */
+ spinlock_t init_lock; /* held by jbd2_mon_collect_stat() while it updates this structure */
+ spinlock_t show_lock; /* second spinlock, also initialised by jbd2_mon_buffer_init() */
+ u64 count[MAX_NUM_LEVEL]; /* per-bucket hit counters, incremented by jbd2_mon_do_init_count() */
+};
+
+extern struct jbd2_delay_stat jbd2_write_access_buffer;
+
+extern void jbd2_mon_proc_init(struct proc_dir_entry *parent, journal_t *journal);
+extern void jbd2_mon_proc_exit(struct proc_dir_entry *parent);
+extern void jbd2_mon_buffer_init(struct jbd2_delay_stat *buffer);
+extern void jbd2_mon_collect_dgwa_stat(struct journal_s *journal, struct buffer_head *bh,
+ struct task_struct *ts, u64 time_monitor);
+extern void jbd2_mon_collect_atc_stat(struct journal_s *journal, u64 time_monitor);
+
/**
* struct journal_s - The journal_s type is the concrete type associated with
* journal_t.
@@ -994,6 +1014,10 @@ struct journal_s
/* Precomputed journal UUID checksum for seeding other checksums */
__u32 j_csum_seed;
+
+ /* Storing statistics to print monitor result */
+#define MAX_JBD2_MONITOR_BUFFER 2
+ struct jbd2_delay_stat *j_monitor_buffer[MAX_JBD2_MONITOR_BUFFER];
};
/*
============
fs/jbd2/time_delay_monitor.c
#include <linux/jbd2.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
/*
 * Slot of each monitored function's statistics buffer inside
 * journal->j_monitor_buffer[].
 */
enum jbd2_mon_func_map {
	atc = 0,	/* add_transaction_credits() */
	dgwa = 1,	/* do_get_write_access() */
};
/*
 * Histogram buckets used to classify a measured delay.  The names give
 * the bucket range as it is labelled in the /proc output; T_COUNT is the
 * number of buckets and is used to size the local snapshot array.
 */
enum time_count_t {
	T_0_50 = 0,		/* "0-50us" */
	T_50_100 = 1,		/* "50-100us" */
	T_100_200 = 2,		/* "100us-200us" */
	T_200_500 = 3,		/* "200-500us" */
	T_500_1k = 4,		/* "500-1000us" */
	T_1k_5k = 5,		/* "1-5ms" */
	T_5k_10k = 6,		/* "5-10ms" */
	T_10k_100k = 7,		/* "10-100ms" */
	T_100k_BIGGER = 8,	/* "100ms-bigger" */
	T_COUNT = 9,		/* number of buckets, not a bucket itself */
};
/*
 * jbd2_mon_seq_show() - print the delay histogram of one monitor buffer.
 * @s:      seq_file that receives the text
 * @buffer: statistics to print
 *
 * The counters are first copied into a local array so that no spinlock is
 * held while seq_printf() runs.  The copy is taken under init_lock because
 * that is the lock jbd2_mon_collect_stat() holds while it increments the
 * counters; snapshotting under a different lock would not exclude the
 * writer at all.
 *
 * sizeof(count) covers T_COUNT entries, so buffer->count[] must hold at
 * least that many (MAX_NUM_LEVEL is expected to equal T_COUNT).
 *
 * Always returns 0.
 */
static int jbd2_mon_seq_show(struct seq_file *s, struct jbd2_delay_stat *buffer)
{
	u64 count[T_COUNT];

	/* Same lock as the update path, so the snapshot is consistent. */
	spin_lock(&buffer->init_lock);
	memcpy(count, buffer->count, sizeof(count));
	spin_unlock(&buffer->init_lock);

	seq_printf(s, "0-50us: \t%llu\n", count[T_0_50]);
	seq_printf(s, "50-100us: \t%llu\n", count[T_50_100]);
	seq_printf(s, "100us-200us: \t%llu\n", count[T_100_200]);
	seq_printf(s, "200-500us: \t%llu\n", count[T_200_500]);
	seq_printf(s, "500-1000us: \t%llu\n", count[T_500_1k]);
	seq_printf(s, "1-5ms: \t\t%llu\n", count[T_1k_5k]);
	seq_printf(s, "5-10ms: \t%llu\n", count[T_5k_10k]);
	seq_printf(s, "10-100ms: \t%llu\n", count[T_10k_100k]);
	seq_printf(s, "100ms-bigger: \t%llu\n", count[T_100k_BIGGER]);
	seq_putc(s, '\n');

	return 0;
}
/*
 * jbd2_mon_do_init_count() - account one measured delay in the histogram.
 * @time:   measured delay; it is divided by 1000 before being classified
 * @buffer: statistics whose matching count[] bucket is incremented
 *
 * The scaled value is compared against the exclusive upper bound of each
 * bucket in ascending order; a value not below the last bound lands in
 * T_100k_BIGGER.  No locking is done here: the caller chain in this file
 * (jbd2_mon_collect_stat()) holds buffer->init_lock.
 *
 * NOTE(review): the /proc labels call the scaled value microseconds, so
 * @time is presumably in nanoseconds - confirm against the callers.
 */
static void jbd2_mon_do_init_count(u64 time,
				   struct jbd2_delay_stat *buffer)
{
	/* Exclusive upper bound of every bucket except the open-ended last one. */
	static const u64 bucket_limit[T_100k_BIGGER] = {
		[T_0_50]	= 50,
		[T_50_100]	= 100,
		[T_100_200]	= 200,
		[T_200_500]	= 500,
		[T_500_1k]	= 1000,
		[T_1k_5k]	= 5 * 1000,
		[T_5k_10k]	= 10 * 1000,
		[T_10k_100k]	= 100 * 1000,
	};
	u64 scaled = time / 1000;
	unsigned int slot;

	/* Falls out with slot == T_100k_BIGGER when no bound matches. */
	for (slot = 0; slot < T_100k_BIGGER; slot++) {
		if (scaled < bucket_limit[slot])
			break;
	}

	buffer->count[slot]++;
}
/*
 * Interface:
 * Classify one measured delay into the histogram of @buffer.  This part
 * does not depend on the calling process, so it can be applied to every
 * sample.
 *
 * @time:   measured delay, passed through to jbd2_mon_do_init_count()
 * @buffer: statistics to update
 */
static void jbd2_mon_buffer_count_init(u64 time,
		struct jbd2_delay_stat *buffer)
{
	/* Plain call: a void function must not "return" an expression. */
	jbd2_mon_do_init_count(time, buffer);
}
/*
 * Interface:
 * Record which task and device produced the most recent sample.
 *
 * @bh:           buffer head of the sample; bh->b_bdev is dereferenced
 * @ts:           task whose pid is stored
 * @time_monitor: measured delay, stored as-is in buffer->time_delay
 * @buffer:       statistics structure to fill in
 *
 * Each call overwrites the values stored by the previous one.
 */
static void jbd2_mon_buffer_process_init(struct buffer_head *bh,
		struct task_struct *ts, u64 time_monitor,
		struct jbd2_delay_stat *buffer)
{
	buffer->time_delay = time_monitor;
	buffer->pid = ts->pid;
	buffer->dev = bh->b_bdev->bd_dev;
}
/*
 * jbd2_mon_collect_stat() - fold one sample into a statistics buffer.
 * @bh:           buffer head of the sample, or NULL
 * @ts:           task that produced the sample, or NULL
 * @time_monitor: measured delay
 * @buffer:       statistics to update
 *
 * The whole update runs under buffer->init_lock.  The task/device details
 * are stored only when both @bh and @ts are given; the histogram is
 * updated for every sample.
 */
static void jbd2_mon_collect_stat(struct buffer_head *bh, struct task_struct *ts,
		u64 time_monitor, struct jbd2_delay_stat *buffer)
{
	spin_lock(&buffer->init_lock);

	if (bh && ts)
		jbd2_mon_buffer_process_init(bh, ts, time_monitor, buffer);

	jbd2_mon_buffer_count_init(time_monitor, buffer);

	spin_unlock(&buffer->init_lock);
}
/*
 * jbd2_mon_seq_open() - common open helper for the monitor /proc files.
 * @inode: inode of the /proc entry; its PDE data is the journal
 * @file:  file being opened
 * @index: which entry of journal->j_monitor_buffer[] this file shows
 * @ops:   seq_file operations to install
 *
 * On success the selected statistics buffer is stored in the seq_file's
 * ->private field, where the show callbacks pick it up.
 *
 * Returns the result of seq_open().
 *
 * NOTE(review): the buffer pointer is not checked for NULL here - confirm
 * that the /proc entries cannot be opened before the buffers exist.
 */
static int jbd2_mon_seq_open(struct inode *inode, struct file *file,
		enum jbd2_mon_func_map index, struct seq_operations *ops)
{
	journal_t *journal = PDE_DATA(inode);
	struct jbd2_delay_stat *stats = journal->j_monitor_buffer[index];
	struct seq_file *seq;
	int err;

	err = seq_open(file, ops);
	if (err)
		return err;

	/* seq_open() left its seq_file in file->private_data. */
	seq = file->private_data;
	seq->private = stats;

	return 0;
}
/* The first monitoring point:
 * Monitoring add_transaction_credits() function for classification */

/*
 * seq_file .start callback: the file consists of a single record, so only
 * position 0 yields an element (SEQ_START_TOKEN); any other position ends
 * the sequence.
 */
static void *jbd2_mon_seq_atc_start(struct seq_file *s, loff_t *pos)
{
	if (*pos != 0)
		return NULL;

	return SEQ_START_TOKEN;
}
/* seq_file .stop callback: .start acquires nothing, so there is nothing to undo. */
static void jbd2_mon_seq_atc_stop(struct seq_file *s, void *pos)
{
	/* intentionally empty */
}
/*
 * seq_file .next callback: there is no second record, so the sequence
 * always ends here.
 *
 * The position is still advanced before returning NULL.  The seq_file
 * iterator contract expects .next to move *pos forward even at the end of
 * the sequence; with the position left at 0, a later .start call would
 * hand out SEQ_START_TOKEN again.
 */
static void *jbd2_mon_seq_atc_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;
}
/*
 * seq_file .show callback: print the histogram stored in s->private (set
 * by jbd2_mon_seq_open()) for the single SEQ_START_TOKEN record; any other
 * element is ignored.  Always returns 0.
 */
static int jbd2_mon_seq_atc_show(struct seq_file *s, void *v)
{
	struct jbd2_delay_stat *stats = s->private;

	if (v == SEQ_START_TOKEN)
		jbd2_mon_seq_show(s, stats);

	return 0;
}
/*
 * Iterator callbacks for the add_transaction_credits() statistics file.
 * Left non-const because jbd2_mon_seq_open() takes a non-const pointer.
 */
static struct seq_operations jbd2_mon_seq_atc_ops = {
	.start	= jbd2_mon_seq_atc_start,
	.next	= jbd2_mon_seq_atc_next,
	.stop	= jbd2_mon_seq_atc_stop,
	.show	= jbd2_mon_seq_atc_show,
};
/*
 * .open handler of the add_transaction_credits() statistics file: binds
 * the file to the "atc" buffer of the journal and to the atc iterator.
 * Returns whatever jbd2_mon_seq_open() returns.
 */
static int jbd2_mon_seq_atc_open(struct inode *inode, struct file *file)
{
	return jbd2_mon_seq_open(inode, file, atc, &jbd2_mon_seq_atc_ops);
}
/*
 * File operations of the add_transaction_credits() statistics file: a
 * custom open plus the generic seq_file read/llseek/release helpers.
 */
static const struct file_operations jbd2_mon_seq_atc_fops = {
	.open		= jbd2_mon_seq_atc_open,
	.release	= seq_release,
	.read		= seq_read,
	.llseek		= seq_lseek,
};
/*
 * jbd2_mon_collect_atc_stat() - record one add_transaction_credits() delay.
 * @journal:      journal whose "atc" statistics buffer is updated
 * @time_monitor: measured delay (the caller passes the difference of two
 *                sched_clock() readings)
 *
 * Called by add_transaction_credits() in fs/jbd2/transaction.c.  No buffer
 * head or task is passed on, so only the histogram is updated.  Other
 * functions can be monitored by adding similar collectors.
 */
void jbd2_mon_collect_atc_stat(struct journal_s *journal, u64 time_monitor)
{
	struct jbd2_delay_stat *stats = journal->j_monitor_buffer[atc];

	jbd2_mon_collect_stat(NULL, NULL, time_monitor, stats);
}
/* The second monitoring point:
 * Monitoring do_get_write_access() function for classification */

/*
 * seq_file .start callback: the file consists of a single record, so only
 * position 0 yields an element (SEQ_START_TOKEN); any other position ends
 * the sequence.
 */
static void *jbd2_mon_seq_dgwa_start(struct seq_file *s, loff_t *pos)
{
	if (*pos != 0)
		return NULL;

	return SEQ_START_TOKEN;
}
/* seq_file .stop callback: .start acquires nothing, so there is nothing to undo. */
static void jbd2_mon_seq_dgwa_stop(struct seq_file *s, void *pos)
{
	/* intentionally empty */
}
/*
 * seq_file .next callback: there is no second record, so the sequence
 * always ends here.
 *
 * The position is still advanced before returning NULL.  The seq_file
 * iterator contract expects .next to move *pos forward even at the end of
 * the sequence; with the position left at 0, a later .start call would
 * hand out SEQ_START_TOKEN again.
 */
static void *jbd2_mon_seq_dgwa_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;
}
/*
 * seq_file .show callback: print the histogram stored in s->private (set
 * by jbd2_mon_seq_open()) for the single SEQ_START_TOKEN record; any other
 * element is ignored.  Always returns 0.
 */
static int jbd2_mon_seq_dgwa_show(struct seq_file *s, void *v)
{
	struct jbd2_delay_stat *stats = s->private;

	if (v == SEQ_START_TOKEN)
		jbd2_mon_seq_show(s, stats);

	return 0;
}
/*
 * Iterator callbacks for the do_get_write_access() statistics file.
 * Left non-const because jbd2_mon_seq_open() takes a non-const pointer.
 */
static struct seq_operations jbd2_mon_seq_dgwa_ops = {
	.start	= jbd2_mon_seq_dgwa_start,
	.next	= jbd2_mon_seq_dgwa_next,
	.stop	= jbd2_mon_seq_dgwa_stop,
	.show	= jbd2_mon_seq_dgwa_show,
};
/*
 * .open handler of the do_get_write_access() statistics file: binds the
 * file to the "dgwa" buffer of the journal and to the dgwa iterator.
 * Returns whatever jbd2_mon_seq_open() returns.
 */
static int jbd2_mon_seq_dgwa_open(struct inode *inode, struct file *file)
{
	return jbd2_mon_seq_open(inode, file, dgwa, &jbd2_mon_seq_dgwa_ops);
}
/*
 * File operations of the do_get_write_access() statistics file: a custom
 * open plus the generic seq_file read/llseek/release helpers.
 */
static const struct file_operations jbd2_mon_seq_dgwa_fops = {
	.open		= jbd2_mon_seq_dgwa_open,
	.release	= seq_release,
	.read		= seq_read,
	.llseek		= seq_lseek,
};
/*
 * jbd2_mon_collect_dgwa_stat() - record one do_get_write_access() delay.
 * @journal:      journal whose "dgwa" statistics buffer is updated
 * @bh:           buffer head the write access was requested for
 * @ts:           task that requested it
 * @time_monitor: measured delay (the caller passes the difference of two
 *                sched_clock() readings)
 *
 * Called by do_get_write_access() in fs/jbd2/transaction.c.  Other
 * functions can be monitored by adding similar collectors.
 */
void jbd2_mon_collect_dgwa_stat(struct journal_s *journal, struct buffer_head *bh,
		struct task_struct *ts, u64 time_monitor)
{
	struct jbd2_delay_stat *stats = journal->j_monitor_buffer[dgwa];

	jbd2_mon_collect_stat(bh, ts, time_monitor, stats);
}
/*
 * jbd2_mon_buffer_init() - bring a freshly allocated statistics buffer
 * into a defined state.
 * @buffer: buffer to initialise
 *
 * Initialises both spinlocks and clears every field.  The callers shown
 * in the journal.c patch allocate the buffer with kmalloc(), which does
 * not zero memory, so dev, pid and time_delay are cleared explicitly
 * along with the histogram counters rather than left holding stale heap
 * contents.
 */
void jbd2_mon_buffer_init(struct jbd2_delay_stat *buffer)
{
	int i;

	spin_lock_init(&buffer->init_lock);
	spin_lock_init(&buffer->show_lock);

	buffer->dev = 0;
	buffer->pid = 0;
	buffer->time_delay = 0;

	for (i = 0; i < MAX_NUM_LEVEL; i++)
		buffer->count[i] = 0;
}
/*
 * jbd2_mon_proc_init() - create the two monitor files of a journal.
 * @parent:  /proc directory of the journal
 * @journal: journal handed to the files as their private data
 *
 * Called from jbd2_stats_proc_init() in journal.c, next to the creation
 * of the journal's "info" entry.  The return values of
 * proc_create_data() are not checked.
 *
 * NOTE(review): the mode argument is 0; presumably procfs then applies
 * its default permissions - confirm.
 */
void jbd2_mon_proc_init(struct proc_dir_entry *parent, journal_t *journal)
{
	proc_create_data("fsdelay_do_get_write_access", 0,
			 parent, &jbd2_mon_seq_dgwa_fops, journal);

	proc_create_data("fsdelay_add_transaction_credits", 0,
			 parent, &jbd2_mon_seq_atc_fops, journal);
}
/*
 * jbd2_mon_proc_exit() - remove the two monitor files of a journal.
 * @parent: /proc directory the files were created in
 *
 * Counterpart of jbd2_mon_proc_init(); called from jbd2_stats_proc_exit()
 * in journal.c.
 */
void jbd2_mon_proc_exit(struct proc_dir_entry *parent)
{
	remove_proc_entry("fsdelay_do_get_write_access", parent);

	remove_proc_entry("fsdelay_add_transaction_credits", parent);
}
===systemtap部分====
include/trace/events/jbd2.h
@@ -10,6 +10,28 @@
struct transaction_chp_stats_s;
struct transaction_run_stats_s;
+TRACE_EVENT(jbd2_write_access_delay,
+ /* Tracepoint fired from do_get_write_access() with the measured delay */
+ TP_PROTO(struct buffer_head *bh, struct task_struct *thread, long long time_monitor),
+ /* NOTE(review): the call site passes a u64 for time_monitor; it is converted to long long here */
+ TP_ARGS(bh, thread, time_monitor),
+ /* Recorded fields: device number, pid and the delay value */
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( pid_t, pid)
+ __field( long long, time_monitor)
+ ),
+ /* bh->b_bdev is dereferenced below without a NULL check */
+ TP_fast_assign(
+ __entry->dev = bh->b_bdev->bd_dev;
+ __entry->pid = thread->pid;
+ __entry->time_monitor = time_monitor;
+ ),
+ /* Output format: "dev <major>,<minor> pid <pid> time <delay>" */
+ TP_printk("dev %d,%d pid %d time %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pid, __entry->time_monitor)
+);
+
TRACE_EVENT(jbd2_checkpoint,
TP_PROTO(journal_t *journal, int result),