From 7fea96c04f87a02efc27207629d13f9b2a14ac55 Mon Sep 17 00:00:00 2001 From: behlendo Date: Mon, 21 Apr 2008 22:44:11 +0000 Subject: [PATCH] More fixes to ensure we get good debug logs even if we're in the process of destroying the stacks. Threshold set fairly aggressively to 80% of stack usage. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@82 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c --- include/sys/debug.h | 52 ++++++++++++++++++---------------- modules/spl/spl-debug.c | 62 ++++++++++++++++++++++------------------- modules/spl/spl-proc.c | 2 +- 3 files changed, 63 insertions(+), 53 deletions(-) diff --git a/include/sys/debug.h b/include/sys/debug.h index 720e4136e..8f3fcd94b 100644 --- a/include/sys/debug.h +++ b/include/sys/debug.h @@ -64,6 +64,16 @@ extern unsigned int spl_debug_stack; #define SPL_DEFAULT_MIN_DELAY ((HZ + 1) / 2) #define SPL_DEFAULT_BACKOFF 2 +#define DL_NOTHREAD 0x0001 /* Do not create a new thread */ +#define DL_SINGLE_CPU 0x0002 /* Collect pages from this CPU */ + +typedef struct dumplog_priv { + wait_queue_head_t dp_waitq; + pid_t dp_pid; + int dp_flags; + atomic_t dp_done; +} dumplog_priv_t; + typedef struct { unsigned long cdls_next; int cdls_count; @@ -147,7 +157,7 @@ struct page_collection { int pc_want_daemon_pages; }; -#define SBUG() spl_debug_bug(__FILE__, __FUNCTION__, __LINE__); +#define SBUG() spl_debug_bug(__FILE__, __FUNCTION__, __LINE__, 0); #ifdef __ia64__ #define CDEBUG_STACK() (THREAD_SIZE - \ @@ -159,29 +169,24 @@ struct page_collection { (THREAD_SIZE - 1))) # endif /* __ia64__ */ +/* DL_NOTHREAD and DL_SINGLE_CPU flags are passed to spl_debug_bug() + * because we have overrun our stack and likely damaged at least one + * other unknown thread's stack. We must finish generating the needed + * debug info within this thread context because once we yield the CPU + * it's very likely the system will crash. 
+ */ #define __CHECK_STACK(file, func, line) \ do { \ unsigned long _stack = CDEBUG_STACK(); \ - unsigned long _soft_limit = (9 * THREAD_SIZE) / 10; \ + unsigned long _soft_limit = (8 * THREAD_SIZE) / 10; \ \ if (unlikely(_stack > _soft_limit && _stack > spl_debug_stack)){\ spl_debug_stack = _stack; \ - if (_stack <= THREAD_SIZE) { \ - spl_debug_msg(NULL, D_TRACE, D_WARNING, \ - file, func, line, "Warning " \ - "exceeded 90%% of maximum safe " \ - "stack size (%lu/%lu)\n", \ - _stack, THREAD_SIZE); \ - spl_debug_dumpstack(NULL); \ - spl_debug_dumplog(); \ - } else { \ - spl_debug_msg(NULL, D_TRACE, D_WARNING, \ - file, func, line, "Error " \ - "exceeded maximum safe stack " \ - "size (%lu/%lu)\n", \ - _stack, THREAD_SIZE); \ - SBUG(); \ - } \ + spl_debug_msg(NULL, D_TRACE, D_WARNING, \ + file, func, line, "Error exceeded " \ + "maximum safe stack size (%lu/%lu)\n", \ + _stack, THREAD_SIZE); \ + spl_debug_bug(file, func, line, DL_SINGLE_CPU); \ } \ } while (0) @@ -213,7 +218,7 @@ do { \ spl_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG, \ __FILE__, __FUNCTION__, __LINE__, \ "ASSERTION(" #cond ") failed\n"); \ - spl_debug_bug(__FILE__, __FUNCTION__, __LINE__); \ + SBUG(); \ } \ } while (0) @@ -226,7 +231,7 @@ do { \ __FILE__, __FUNCTION__, __LINE__, \ "ASSERTION(" #cond ") failed:" fmt, \ ## a); \ - spl_debug_bug(__FILE__, __FUNCTION__, __LINE__) \ + SBUG(); \ } \ } while (0) @@ -242,7 +247,7 @@ do { \ __FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" FMT " " #OP " " FMT ")\n", \ CAST __left, CAST __right); \ - spl_debug_bug(__FILE__, __FUNCTION__, __LINE__); \ + SBUG(); \ } \ } while (0) @@ -285,7 +290,6 @@ do { \ #define CDEBUG_LIMIT(mask, format, a...) \ __CDEBUG_LIMIT(DEBUG_SUBSYSTEM, mask, format, ## a) -#define dprintf(fmt, a...) CDEBUG_LIMIT(D_INFO, fmt, ## a) #define CWARN(fmt, a...) CDEBUG_LIMIT(D_WARNING, fmt, ## a) #define CERROR(fmt, a...) CDEBUG_LIMIT(D_ERROR, fmt, ## a) #define CEMERG(fmt, a...) 
CDEBUG_LIMIT(D_EMERG, fmt, ## a) @@ -329,9 +333,9 @@ extern unsigned long spl_debug_get_subsys(void); extern int spl_debug_set_mb(int mb); extern int spl_debug_get_mb(void); -extern int spl_debug_dumplog(void); +extern int spl_debug_dumplog(int flags); extern void spl_debug_dumpstack(struct task_struct *tsk); -extern void spl_debug_bug(char *file, const char *func, const int line); +extern void spl_debug_bug(char *file, const char *func, const int line, int flags); extern int spl_debug_clear_buffer(void); extern int spl_debug_mark_buffer(char *text); diff --git a/modules/spl/spl-debug.c b/modules/spl/spl-debug.c index 47d0c854e..8c60ea2de 100644 --- a/modules/spl/spl-debug.c +++ b/modules/spl/spl-debug.c @@ -102,7 +102,7 @@ struct rw_semaphore trace_sem; atomic_t trace_tage_allocated = ATOMIC_INIT(0); static int panic_notifier(struct notifier_block *, unsigned long, void *); -static int spl_debug_dump_all_pages(char *); +static int spl_debug_dump_all_pages(dumplog_priv_t *dp, char *); static void trace_fini(void); @@ -344,12 +344,6 @@ spl_debug_str2mask(unsigned long *mask, const char *str, int is_subsys) return 0; } -typedef struct dumplog_priv { - wait_queue_head_t dp_waitq; - pid_t dp_pid; - atomic_t dp_flag; -} dumplog_priv_t; - static void spl_debug_dumplog_internal(dumplog_priv_t *dp) { @@ -362,7 +356,7 @@ spl_debug_dumplog_internal(dumplog_priv_t *dp) "%s.%ld.%ld", spl_debug_file_path, get_seconds(), (long)dp->dp_pid); printk(KERN_ALERT "SPL: dumping log to %s\n", spl_debug_file_name); - spl_debug_dump_all_pages(spl_debug_file_name); + spl_debug_dump_all_pages(dp, spl_debug_file_name); current->journal_info = journal_info; } @@ -373,29 +367,36 @@ spl_debug_dumplog_thread(void *arg) dumplog_priv_t *dp = (dumplog_priv_t *)arg; spl_debug_dumplog_internal(dp); - atomic_set(&dp->dp_flag, 1); + atomic_set(&dp->dp_done, 1); wake_up(&dp->dp_waitq); do_exit(0); return 0; /* Unreachable */ } +/* When flag is set do not use a new thread for the debug dump */ int 
-spl_debug_dumplog(void) +spl_debug_dumplog(int flags) { struct task_struct *tsk; dumplog_priv_t dp; - init_waitqueue_head(&dp.dp_waitq); - dp.dp_pid = current->pid; - atomic_set(&dp.dp_flag, 0); + init_waitqueue_head(&dp.dp_waitq); + dp.dp_pid = current->pid; + dp.dp_flags = flags; + atomic_set(&dp.dp_done, 0); - tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); - if (tsk == NULL) - return -ENOMEM; + if (dp.dp_flags & DL_NOTHREAD) { + spl_debug_dumplog_internal(&dp); + } else { - wake_up_process(tsk); - wait_event(dp.dp_waitq, atomic_read(&dp.dp_flag)); + tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); + if (tsk == NULL) + return -ENOMEM; + + wake_up_process(tsk); + wait_event(dp.dp_waitq, atomic_read(&dp.dp_done)); + } return 0; } @@ -849,7 +850,7 @@ EXPORT_SYMBOL(spl_debug_vmsg); * some arch, this will have to be implemented separately in each arch. */ static void -panic_collect_pages(struct page_collection *pc) +collect_pages_from_single_cpu(struct page_collection *pc) { struct trace_cpu_data *tcd; int i, j; @@ -876,12 +877,12 @@ collect_pages_on_cpu(void *info) } static void -collect_pages(struct page_collection *pc) +collect_pages(dumplog_priv_t *dp, struct page_collection *pc) { INIT_LIST_HEAD(&pc->pc_pages); - if (spl_panic_in_progress) - panic_collect_pages(pc); + if (spl_panic_in_progress || dp->dp_flags & DL_SINGLE_CPU) + collect_pages_from_single_cpu(pc); else trace_call_on_all_cpus(collect_pages_on_cpu, pc); } @@ -944,7 +945,7 @@ trace_filp_open (const char *name, int flags, int mode, int *err) #define trace_filp_poff(f) (&(f)->f_pos) static int -spl_debug_dump_all_pages(char *filename) +spl_debug_dump_all_pages(dumplog_priv_t *dp, char *filename) { struct page_collection pc; struct file *filp; @@ -965,7 +966,7 @@ spl_debug_dump_all_pages(char *filename) } spin_lock_init(&pc.pc_lock); - collect_pages(&pc); + collect_pages(dp, &pc); if (list_empty(&pc.pc_pages)) { rc = 0; goto close; @@ -1006,13 +1007,18 @@ 
spl_debug_dump_all_pages(char *filename) static void spl_debug_flush_pages(void) { + dumplog_priv_t dp; struct page_collection pc; struct trace_page *tage; struct trace_page *tmp; spin_lock_init(&pc.pc_lock); + init_waitqueue_head(&dp.dp_waitq); + dp.dp_pid = current->pid; + dp.dp_flags = 0; + atomic_set(&dp.dp_done, 0); - collect_pages(&pc); + collect_pages(&dp, &pc); list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { __ASSERT_TAGE_INVARIANT(tage); list_del(&tage->linkage); @@ -1109,7 +1115,7 @@ void spl_debug_dumpstack(struct task_struct *tsk) } EXPORT_SYMBOL(spl_debug_dumpstack); -void spl_debug_bug(char *file, const char *func, const int line) +void spl_debug_bug(char *file, const char *func, const int line, int flags) { spl_debug_catastrophe = 1; spl_debug_msg(NULL, 0, D_EMERG, file, func, line, "SBUG\n"); @@ -1124,7 +1130,7 @@ void spl_debug_bug(char *file, const char *func, const int line) spl_panic_in_progress = 1; spl_debug_dumpstack(NULL); - spl_debug_dumplog(); + spl_debug_dumplog(flags); if (spl_debug_panic_on_bug) panic("SBUG"); @@ -1168,7 +1174,7 @@ panic_notifier(struct notifier_block *self, while (current->lock_depth >= 0) unlock_kernel(); - spl_debug_dumplog_internal((void *)(long)current->pid); + spl_debug_dumplog(DL_NOTHREAD | DL_SINGLE_CPU); } return 0; diff --git a/modules/spl/spl-proc.c b/modules/spl/spl-proc.c index f3fb793a2..1f97b3667 100644 --- a/modules/spl/spl-proc.c +++ b/modules/spl/spl-proc.c @@ -179,7 +179,7 @@ proc_dump_kernel(struct ctl_table *table, int write, struct file *filp, ENTRY; if (write) { - spl_debug_dumplog(); + spl_debug_dumplog(0); *ppos += *lenp; } else { *lenp = 0;