On Wed, Oct 7, 2009 at 6:58 PM, Holger Kiehl <Holger.Kiehl@xxxxxx> wrote: > On Tue, 6 Oct 2009, Manish Katiyar wrote: > >> On Tue, Oct 6, 2009 at 7:34 PM, Holger Kiehl <Holger.Kiehl@xxxxxx> wrote: >>> >>> Hello >>> >>> Most of the time I compile my application without the -g option due to >>> performance reasons. Problem is that when it hits some bug and dumps >>> core, this is not very useful because there is hardly any information >>> in it. Is there some way to get some useful information out of >>> the core file? >> >> Is it possible to post your code? At least the start_process() >> function. Given that you have got a sigsegv it is probably an invalid >> pointer access. >> > The code is GPL so that is no problem. However it is long so I just > cut out start_process() which you will find below. > >> You can also try to print $eip (or rip since this is a 64-bit machine) >> and look around the assembly. Output of "disas start_process" from >> gdb will also help. >> > I tried those but I am not familiar with assembly: > > (gdb) print $eip > $1 = void > (gdb) print $rip > $2 = (void (*)()) 0x404b5f <start_process+143> > (gdb) where > #0 0x000000304cc32215 in raise (sig=<value optimized out>) > at ../nptl/sysdeps/unix/sysv/linux/raise.c:64 > #1 0x000000304cc33d83 in abort () at abort.c:88 > #2 0x000000000040b174 in sig_segv () > #3 <signal handler called> > #4 0x0000000000404b5f in start_process () > #5 0x0000000000407b9a in main () > (gdb) disas start_process > Dump of assembler code for function start_process: > 0x0000000000404ad0 <start_process+0>: movslq %esi,%rsi > 0x0000000000404ad3 <start_process+3>: mov %rbx,-0x30(%rsp) > 0x0000000000404ad8 <start_process+8>: mov %rbp,-0x28(%rsp) > 0x0000000000404add <start_process+13>: mov %rsi,%r11 > 0x0000000000404ae0 <start_process+16>: mov $0x68,%esi > 0x0000000000404ae5 <start_process+21>: mov %r12,-0x20(%rsp) > 0x0000000000404aea <start_process+26>: imul %rsi,%r11 > 0x0000000000404aee <start_process+30>: mov 
%r13,-0x18(%rsp) > 0x0000000000404af3 <start_process+35>: mov %r14,-0x10(%rsp) > 0x0000000000404af8 <start_process+40>: mov %r15,-0x8(%rsp) > 0x0000000000404afd <start_process+45>: sub $0x568,%rsp > 0x0000000000404b04 <start_process+52>: mov %rdx,%rbx > 0x0000000000404b07 <start_process+55>: mov %edi,0x24(%rsp) > 0x0000000000404b0b <start_process+59>: mov %r11,%rdi > 0x0000000000404b0e <start_process+62>: add 0x225513(%rip),%rdi > # 0x62a028 <qb> > 0x0000000000404b15 <start_process+69>: cmpb $0x0,0x31(%rdi) > 0x0000000000404b19 <start_process+73>: je 0x404ed8 > <start_process+1032> > 0x0000000000404b1f <start_process+79>: movslq 0x28(%rdi),%rax > 0x0000000000404b23 <start_process+83>: lea 0x0(,%rax,8),%rdx > 0x0000000000404b2b <start_process+91>: mov %rax,%r8 > 0x0000000000404b2e <start_process+94>: shl $0x6,%r8 > 0x0000000000404b32 <start_process+98>: sub %rdx,%r8 > 0x0000000000404b35 <start_process+101>: add 0x2259cc(%rip),%r8 # > 0x62a508 <mdb> > 0x0000000000404b3c <start_process+108>: mov 0x2c(%r8),%r9d > 0x0000000000404b40 <start_process+112>: test %r9d,%r9d > 0x0000000000404b43 <start_process+115>: jne 0x404d70 > <start_process+672> > 0x0000000000404b49 <start_process+121>: movslq 0x24(%rsp),%rax > 0x0000000000404b4e <start_process+126>: imul $0x8f8,%rax,%r14 > 0x0000000000404b55 <start_process+133>: mov %r14,%rax > 0x0000000000404b58 <start_process+136>: add 0x225441(%rip),%rax > # 0x629fa0 <fsa> > 0x0000000000404b5f <start_process+143>: mov 0xec(%rax),%edx > 0x0000000000404b65 <start_process+149>: test $0x1,%dl > 0x0000000000404b68 <start_process+152>: jne 0x404d30 > <start_process+608> > 0x0000000000404b6e <start_process+158>: dec %ecx > 0x0000000000404b70 <start_process+160>: je 0x404bd0 > <start_process+256> > 0x0000000000404b72 <start_process+162>: mov 0xf0(%rax),%ecx > 0x0000000000404b78 <start_process+168>: mov $0x2,%esi > 0x0000000000404b7d <start_process+173>: test %ecx,%ecx > 0x0000000000404b7f <start_process+175>: jne 0x404c88 > 
<start_process+440> > 0x0000000000404b85 <start_process+181>: test %dl,%dl > 0x0000000000404b87 <start_process+183>: jns 0x404bd0 > <start_process+256> > 0x0000000000404b89 <start_process+185>: mov 0x104(%rax),%ecx > 0x0000000000404b8f <start_process+191>: movslq 0x28(%rdi),%rax > 0x0000000000404b93 <start_process+195>: mov $0xffffffff,%esi > 0x0000000000404b98 <start_process+200>: mov %r11,(%rsp) > 0x0000000000404b9c <start_process+204>: lea 0x0(,%rax,8),%rdx > 0x0000000000404ba4 <start_process+212>: shl $0x6,%rax > 0x0000000000404ba8 <start_process+216>: sub %rdx,%rax > 0x0000000000404bab <start_process+219>: mov 0x225956(%rip),%rdx > # 0x62a508 <mdb> > 0x0000000000404bb2 <start_process+226>: mov 0x28(%rdx,%rax,1),%edi > 0x0000000000404bb6 <start_process+230>: mov %rbx,%rdx > 0x0000000000404bb9 <start_process+233>: callq 0x41ab00 > <check_error_queue> > 0x0000000000404bbe <start_process+238>: test %eax,%eax > 0x0000000000404bc0 <start_process+240>: mov %eax,%esi > 0x0000000000404bc2 <start_process+242>: mov (%rsp),%r11 > 0x0000000000404bc6 <start_process+246>: jne 0x404c88 > <start_process+440> > 0x0000000000404bcc <start_process+252>: nopl 0x0(%rax) > 0x0000000000404bd0 <start_process+256>: mov %r14,%rcx > 0x0000000000404bd3 <start_process+259>: add 0x2253c6(%rip),%rcx > # 0x629fa0 <fsa> > 0x0000000000404bda <start_process+266>: cmpb $0x5,0xba(%rcx) > 0x0000000000404be1 <start_process+273>: je 0x404f88 > <start_process+1208> > 0x0000000000404be7 <start_process+279>: mov 0x225462(%rip),%rax > # 0x62a050 <p_afd_status> > 0x0000000000404bee <start_process+286>: mov 0x225194(%rip),%ecx > # 0x629d88 <max_connections> > 0x0000000000404bf4 <start_process+292>: cmp %ecx,0x4f4(%rax) > 0x0000000000404bfa <start_process+298>: jge 0x404d30 > <start_process+608> > 0x0000000000404c00 <start_process+304>: mov %r14,%r8 > 0x0000000000404c03 <start_process+307>: add 0x225396(%rip),%r8 # > 0x629fa0 <fsa> > 0x0000000000404c0a <start_process+314>: mov 0x174(%r8),%edi > 
0x0000000000404c11 <start_process+321>: cmp %edi,0x170(%r8) > 0x0000000000404c18 <start_process+328>: jge 0x404d30 > <start_process+608> > 0x0000000000404c1e <start_process+334>: test %ecx,%ecx > 0x0000000000404c20 <start_process+336>: jle 0x404c5e > <start_process+398> > 0x0000000000404c22 <start_process+338>: mov 0x2251ff(%rip),%rsi > # 0x62---Type <return> to continue, or q <return> to quit---q > > So all I now know is that it happened with the assembly instruction: > > mov 0xec(%rax),%edx > > But what does it tell me. At what part of my code could this be? Hi Holger, I don't have the source code, so a bit hard to guess. But you can try to find out which member of your fsa structure is at offset 236 (0xec) and look around those lines in the function where you are accessing that member. I am trying to download the AFD source code, which looks like it will take ages on my slow broadband. Hopefully I can help after that. > > Thanks, > Holger > > --------- code of start_process() ---------- > static pid_t > start_process(int fsa_pos, int qb_pos, time_t current_time, int retry) > { > pid_t pid = PENDING; > > if ((qb[qb_pos].msg_name[0] != '\0') && > (mdb[qb[qb_pos].pos].age_limit > 0) && > ((fsa[fsa_pos].host_status & DO_NOT_DELETE_DATA) == 0) && > (current_time > qb[qb_pos].creation_time) && > ((current_time - qb[qb_pos].creation_time) > > mdb[qb[qb_pos].pos].age_limit)) > { > char del_dir[MAX_PATH_LENGTH]; > > if (fsa[fsa_pos].host_status & ERROR_QUEUE_SET) > { > remove_from_error_queue(mdb[qb[qb_pos].pos].job_id, &fsa[fsa_pos], > fsa_pos, fsa_fd); > } > (void)sprintf(del_dir, "%s%s%s/%s", > p_work_dir, AFD_FILE_DIR, > OUTGOING_DIR, qb[qb_pos].msg_name); > extract_cus(qb[qb_pos].msg_name, dl.input_time, dl.split_job_counter, > dl.unique_number); > remove_job_files(del_dir, fsa_pos, mdb[qb[qb_pos].pos].job_id, > FD, AGE_OUTPUT, -1); > ABS_REDUCE(fsa_pos); > pid = REMOVED; > } > else > { > int in_error_queue = NEITHER; > > if ((qb[qb_pos].msg_name[0] == '\0') && > 
(*(unsigned char *)((char *)fsa - AFD_FEATURE_FLAG_OFFSET_END) & > DISABLE_RETRIEVE)) > { > ABS_REDUCE(fsa_pos); > > return(REMOVED); > } > > if (((fsa[fsa_pos].host_status & STOP_TRANSFER_STAT) == 0) && > ((retry == YES) || > ((fsa[fsa_pos].error_counter == 0) && > (((fsa[fsa_pos].host_status & ERROR_QUEUE_SET) == 0) || > ((fsa[fsa_pos].host_status & ERROR_QUEUE_SET) && > ((in_error_queue = > check_error_queue(mdb[qb[qb_pos].pos].job_id, > -1, current_time, > > fsa[fsa_pos].retry_interval)) == NO)))) || > ((fsa[fsa_pos].error_counter > 0) && > (fsa[fsa_pos].host_status & ERROR_QUEUE_SET) && > ((current_time - (fsa[fsa_pos].last_retry_time + > fsa[fsa_pos].retry_interval)) >= 0) && > ((in_error_queue == NO) || > ((in_error_queue == NEITHER) && > (check_error_queue(mdb[qb[qb_pos].pos].job_id, -1, > current_time, > fsa[fsa_pos].retry_interval) == NO)))) || > ((fsa[fsa_pos].active_transfers == 0) && > ((current_time - (fsa[fsa_pos].last_retry_time + > fsa[fsa_pos].retry_interval)) >= 0)))) > { > /* > * First lets try and take an existing process, > * that is waiting for more data to come. 
> */ > if ((fsa[fsa_pos].original_toggle_pos == NONE) && > ((fsa[fsa_pos].protocol_options & DISABLE_BURSTING) == 0) && > (fsa[fsa_pos].keep_connected > 0) && > (fsa[fsa_pos].active_transfers > 0) && > (fsa[fsa_pos].jobs_queued > 0) && > ((((fsa[fsa_pos].special_flag & KEEP_CON_NO_SEND) == 0) && > (qb[qb_pos].msg_name[0] != '\0')) || > (((fsa[fsa_pos].special_flag & KEEP_CON_NO_FETCH) == 0) && > (qb[qb_pos].msg_name[0] == '\0'))) && > ((qb[qb_pos].special_flag & HELPER_JOB) == 0)) > { > int i, > other_job_wait_pos[MAX_NO_PARALLEL_JOBS], > other_qb_pos[MAX_NO_PARALLEL_JOBS], > wait_counter = 0; > > for (i = 0; i < fsa[fsa_pos].allowed_transfers; i++) > { > if ((fsa[fsa_pos].job_status[i].proc_id != -1) && > (fsa[fsa_pos].job_status[i].unique_name[2] == 5)) > { > int exec_qb_pos; > > qb_pos_pid(fsa[fsa_pos].job_status[i].proc_id, > &exec_qb_pos); > if (exec_qb_pos != -1) > { > if ((qb[qb_pos].msg_name[0] != '\0') && > (qb[exec_qb_pos].msg_name[0] != '\0') && > (mdb[qb[qb_pos].pos].type == > mdb[qb[exec_qb_pos].pos].type) && > (mdb[qb[qb_pos].pos].port == > mdb[qb[exec_qb_pos].pos].port)) > { > if (qb[qb_pos].retries > 0) > { > fsa[fsa_pos].job_status[i].file_name_in_use[0] = > '\0'; > fsa[fsa_pos].job_status[i].file_name_in_use[1] = > 1; > > (void)sprintf(&fsa[fsa_pos].job_status[i].file_name_in_use[2], > "%u", qb[qb_pos].retries); > } > fsa[fsa_pos].job_status[i].job_id = > mdb[qb[qb_pos].pos].job_id; > mdb[qb[qb_pos].pos].last_transfer_time = > mdb[qb[exec_qb_pos].pos].last_transfer_time = current_time; > (void)memcpy(fsa[fsa_pos].job_status[i].unique_name, > qb[qb_pos].msg_name, > MAX_MSG_NAME_LENGTH); > > (void)memcpy(connection[qb[exec_qb_pos].connect_pos].msg_name, > qb[qb_pos].msg_name, > MAX_MSG_NAME_LENGTH); > qb[qb_pos].pid = qb[exec_qb_pos].pid; > qb[qb_pos].connect_pos = qb[exec_qb_pos].connect_pos; > qb[qb_pos].special_flag |= BURST_REQUEUE; > connection[qb[exec_qb_pos].connect_pos].job_no = i; > if (qb[exec_qb_pos].pid > 0) > { > if 
(kill(qb[exec_qb_pos].pid, SIGUSR1) == -1) > { > system_log(DEBUG_SIGN, __FILE__, __LINE__, > "Failed to send SIGUSR1 to %lld : > %s", > (pri_pid_t)qb[exec_qb_pos].pid, > strerror(errno)); > } > p_afd_status->burst2_counter++; > } > else > { > system_log(DEBUG_SIGN, __FILE__, __LINE__, > "Hmmm, pid = %lld!!!", > (pri_pid_t)qb[exec_qb_pos].pid); > } > if ((fsa[fsa_pos].transfer_rate_limit > 0) || > (no_of_trl_groups > 0)) > { > calc_trl_per_process(fsa_pos); > } > ABS_REDUCE(fsa_pos); > remove_msg(exec_qb_pos); > > return(qb[qb_pos].pid); > } > else > { > other_job_wait_pos[wait_counter] = i; > other_qb_pos[wait_counter] = exec_qb_pos; > wait_counter++; > } > } > else > { > system_log(DEBUG_SIGN, __FILE__, __LINE__, > "Unable to locate qb_pos for %lld > [fsa_pos=%d].", > > (pri_pid_t)fsa[fsa_pos].job_status[i].proc_id, > fsa_pos); > } > } > } > if ((fsa[fsa_pos].active_transfers == > fsa[fsa_pos].allowed_transfers) && > (wait_counter > 0)) > { > for (i = 0; i < wait_counter; i++) > { > if > (fsa[fsa_pos].job_status[other_job_wait_pos[i]].unique_name[2] == 5) > { > if (qb[other_qb_pos[i]].pid > 0) > { > > fsa[fsa_pos].job_status[other_job_wait_pos[i]].unique_name[2] = 6; > if (qb[other_qb_pos[i]].msg_name[0] == '\0') > { > return(PENDING); > } > else > { > if (kill(qb[other_qb_pos[i]].pid, SIGUSR1) == -1) > { > system_log(DEBUG_SIGN, __FILE__, __LINE__, > "Failed to send SIGUSR1 to %lld : > %s", > (pri_pid_t)qb[other_qb_pos[i]].pid, > strerror(errno)); > > fsa[fsa_pos].job_status[other_job_wait_pos[i]].unique_name[2] = 5; > } > else > { > return(PENDING); > } > } > } > else > { > system_log(DEBUG_SIGN, __FILE__, __LINE__, > "Hmmm, pid = %lld!!!", > (pri_pid_t)qb[other_qb_pos[i]].pid); > } > } > } > } > } > > if ((p_afd_status->no_of_transfers < max_connections) && > (fsa[fsa_pos].active_transfers < > fsa[fsa_pos].allowed_transfers)) > { > int pos; > > if ((pos = get_free_connection()) == INCORRECT) > { > system_log(ERROR_SIGN, __FILE__, __LINE__, > "Failed to get 
free connection."); > } > else > { > if ((connection[pos].job_no = get_free_disp_pos(fsa_pos)) != > INCORRECT) > { > if (qb[qb_pos].msg_name[0] == '\0') > { > connection[pos].fra_pos = qb[qb_pos].pos; > connection[pos].protocol = fra[qb[qb_pos].pos].protocol; > connection[pos].msg_name[0] = '\0'; > (void)memcpy(connection[pos].dir_alias, > fra[qb[qb_pos].pos].dir_alias, > MAX_DIR_ALIAS_LENGTH + 1); > } > else > { > connection[pos].fra_pos = -1; > connection[pos].protocol = mdb[qb[qb_pos].pos].type; > (void)memcpy(connection[pos].msg_name, > qb[qb_pos].msg_name, > MAX_MSG_NAME_LENGTH); > connection[pos].dir_alias[0] = '\0'; > } > if (qb[qb_pos].special_flag & RESEND_JOB) > { > connection[pos].resend = YES; > } > else > { > connection[pos].resend = NO; > } > connection[pos].temp_toggle = OFF; > (void)memcpy(connection[pos].hostname, > fsa[fsa_pos].host_alias, > MAX_HOSTNAME_LENGTH + 1); > connection[pos].host_id = fsa[fsa_pos].host_id; > connection[pos].fsa_pos = fsa_pos; > if (fd_check_fsa() == YES) > { > if (check_fra_fd() == YES) > { > init_fra_data(); > } > > /* > * We need to set the connection[pos].pid to a > * value higher then 0 so the function > get_new_positions() > * also locates the new connection[pos].fsa_pos. > Otherwise > * from here on we point to some completely different > * host and this can cause havoc when someone uses > * edit_hc and changes the alias order. 
> */ > connection[pos].pid = 1; > get_new_positions(); > connection[pos].pid = 0; > init_msg_buffer(); > fsa_pos = connection[pos].fsa_pos; > last_pos_lookup = INCORRECT; > } > > (void)strcpy(fsa[fsa_pos].job_status[connection[pos].job_no].unique_name, > qb[qb_pos].msg_name); > if ((fsa[fsa_pos].error_counter == 0) && > (fsa[fsa_pos].auto_toggle == ON) && > (fsa[fsa_pos].original_toggle_pos != NONE) && > (fsa[fsa_pos].max_successful_retries > 0)) > { > if ((fsa[fsa_pos].original_toggle_pos == > fsa[fsa_pos].toggle_pos) && > (fsa[fsa_pos].successful_retries > 0)) > { > fsa[fsa_pos].original_toggle_pos = NONE; > fsa[fsa_pos].successful_retries = 0; > } > else if (fsa[fsa_pos].successful_retries >= > fsa[fsa_pos].max_successful_retries) > { > connection[pos].temp_toggle = ON; > fsa[fsa_pos].successful_retries = 0; > } > else > { > fsa[fsa_pos].successful_retries++; > } > } > > /* Create process to distribute file. */ > if ((connection[pos].pid = make_process(&connection[pos], > qb_pos)) > 0) > { > pid = > fsa[fsa_pos].job_status[connection[pos].job_no].proc_id = > connection[pos].pid; > fsa[fsa_pos].active_transfers += 1; > if ((fsa[fsa_pos].transfer_rate_limit > 0) || > (no_of_trl_groups > 0)) > { > calc_trl_per_process(fsa_pos); > } > ABS_REDUCE(fsa_pos); > qb[qb_pos].connect_pos = pos; > p_afd_status->no_of_transfers++; > } > else > { > > fsa[fsa_pos].job_status[connection[pos].job_no].connect_status = > NOT_WORKING; > > fsa[fsa_pos].job_status[connection[pos].job_no].no_of_files = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].no_of_files_done = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].file_size = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].file_size_done = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].file_size_in_use = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].file_size_in_use_done = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].file_name_in_use[0] = '\0'; > > 
fsa[fsa_pos].job_status[connection[pos].job_no].file_name_in_use[1] = 0; > > fsa[fsa_pos].job_status[connection[pos].job_no].unique_name[0] = '\0'; > connection[pos].hostname[0] = '\0'; > connection[pos].msg_name[0] = '\0'; > connection[pos].host_id = 0; > connection[pos].job_no = -1; > connection[pos].fsa_pos = -1; > connection[pos].fra_pos = -1; > connection[pos].pid = 0; > } > } > } > } > } > } > return(pid); > } > -- Thanks - Manish ================================== [$\*.^ -- I miss being one of them ================================== -- To unsubscribe from this list: send the line "unsubscribe linux-c-programming" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html