Hi all,

I have a line that gets called approximately 2 trillion times according to valgrind, and I'd love any suggestions for speeding it up. I have taken over the project from someone else and my C abilities are only intermediate. I've attached a trimmed-down version of the function.

Some notes about the code:

* tsMemAlloc and tsMemFree are basically wrappers around malloc/free.
* dsfmt_genrand_close_open() generates a number in the range [0,1).

And some notes about the data:

* n_topics is 35 (loaded from a configuration file; changeable)
* n_tokens is about 7M (and increases as the data set grows)
* params->iter is 500
* topics->wcount size is about 190k items
* topics->dcount size is about 275k items
* topics->tcount size is 35 items

I'm compiling on an Opteron with GCC 4.4.3 and these options:

-O3 -std=gnu99 -march=native -ffast-math -ftree-loop-distribution
-ftree-loop-linear -ftree-interchange -floop-block -ftree-vectorizer-verbose=8

The tree vectorizer spits out this note about the loop I'm particularly interested in:

for (int k = 1; k < n_topics; k++)
    cdf[k] = cdf[k-1] + (dcount[k+d_offset] + alpha)*(wcount[k+w_offset] + beta)/(tcount[k] + wbeta);

note: not vectorized: data ref analysis failed D.8597_92 = *D.8596_91;

I've done hours of googling, played with restrict keywords, and split the cdf[k-1] addition into another loop, and nothing helps. The error message itself is very poor: I can't find a decent explanation anywhere online of what it means or how to fix it.

Thanks for whatever insight you can give!
-Mark
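P.S. In case anyone wants to poke at the loop without the rest of the project, here's my attempt at a self-contained reduction. The fixed size and the wrapper function are made up just for the repro; in the real code everything comes from the structures in the attached file. Compiling just this file with the flags above should be enough to see what the vectorizer does with the cdf[k-1] recurrence:

/* Standalone reduction of the loop the vectorizer rejects.
   N_TOPICS and the function signature are made up for the repro. */
enum { N_TOPICS = 35 };

void cdf_loop(double * restrict cdf,
              const unsigned int * restrict dcount,
              const unsigned int * restrict wcount,
              const unsigned int * restrict tcount,
              unsigned int d_offset, unsigned int w_offset,
              double alpha, double beta, double wbeta)
{
    /* Same shape as in the attached code: a scan where cdf[k] depends on
       cdf[k-1], over unsigned counters promoted to double. */
    cdf[0] = (dcount[d_offset] + alpha)*(wcount[w_offset] + beta)/(tcount[0] + wbeta);
    for (int k = 1; k < N_TOPICS; k++)
        cdf[k] = cdf[k-1] + (dcount[k+d_offset] + alpha)
                          * (wcount[k+w_offset] + beta)
                          / (tcount[k] + wbeta);
}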
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>

#include "dSFMT.h"
#include "topicsort.h"
#include "sort.h"
#include "memory.h"

// Initializes the system by randomly generating topic assignments for all of the words in the
// dataset. It's assumed that all of the required memory has already been allocated.
void randomInit(TSTopics *topics, unsigned int *tokens, dsfmt_t *prng)
{
    const char msg[] = "\r - Choosing random topics...%d%%";
    printf(" - Choosing random topics...");
    fflush(stdout);

    const unsigned int n_tokens = topics->n_tokens;
    for (unsigned int i = 0; i < n_tokens; i++) {
        // Current token word and document
        unsigned int w = tokens[i];
        unsigned int d = tokens[i + n_tokens];

        // Generate a random topic and assign it to a word
        int t = (int)(topics->n_topics*dsfmt_genrand_close_open(prng));
        topics->assign[i] = t;
        ++topics->wcount[t + w*topics->n_topics];
        ++topics->dcount[t + d*topics->n_topics];
        ++topics->tcount[t];

        // too slow on a large dataset
        //printf(msg, 100*((i+1)/n_tokens));
    }
    printf("\r - Choosing random topics...done\n");
}

// Perform the LDA operation
void tsTopicLDA(TSTopics *topics, TSParams *params, TSCorpus *corpus)
{
    // Initialize the pseudo-random number generator (PRNG)
    dsfmt_t prng;
    dsfmt_init_gen_rand(&prng, params->seed);

    // Initialize the solver
    printf(" - Initializing tokens\n");
    randomInit(topics, corpus->tokens, &prng);

    // Stores the estimated CDF
    double *cdf;
    tsMemAlloc((void **)&cdf, sizeof(double)*params->topics);

    // Dereference all of the pointers (most are array pointers anyway so this can be done)
    const unsigned int n_tokens = corpus->n_tokens;
    const unsigned int n_topics = topics->n_topics;
    const double alpha = params->alpha;
    const double beta  = params->beta;
    const double wbeta = (double)corpus->n_words*params->beta;
    unsigned int *wrd_tokens = &corpus->tokens[0];
    unsigned int *doc_tokens = &corpus->tokens[n_tokens];
    unsigned int * restrict wcount = topics->wcount;
    unsigned int * restrict dcount = topics->dcount;
    unsigned int * restrict tcount = topics->tcount;
    unsigned int * restrict assign = topics->assign;

    printf(" - Gibbs sampler burn-in period set to %d iterations\n", params->iter);
    const char msg[] = "\r - Running iteration %d of %d";

    // Setup the timers (for metric purposes)
    struct timeval start;
    struct timeval end;
    struct timeval diff;
    diff.tv_sec  = 0;
    diff.tv_usec = 0;

    // Run the Gibbs sampler for the given burn-in duration
    for (int i = 0; i < params->iter; i++) {
        printf(msg, i+1, params->iter);
        fflush(stdout);

        // Record the start of the iteration
        gettimeofday(&start, NULL);

        // Sample each word
        for (unsigned int j = 0; j < n_tokens; j++) {
            const unsigned int w = wrd_tokens[j];
            const unsigned int d = doc_tokens[j];
            const unsigned int t = assign[j];
            const unsigned int w_offset = w*n_topics;
            const unsigned int d_offset = d*n_topics;
            double * restrict rcdf = cdf;

            // Decrement the counters for the current word to remove its influence
            --wcount[t + w_offset];
            --dcount[t + d_offset];
            --tcount[t];

            // Infer the CDF from the current state
            //cdf[0] = (dcount[d_offset] + alpha)*(wcount[w_offset] + beta)/(tcount[0] + wbeta);
            //for (int k = 1; k < n_topics; k++)
            //    cdf[k] = cdf[k-1] + (dcount[k+d_offset] + alpha)*(wcount[k+w_offset] + beta)/(tcount[k] + wbeta);
            for (int k = 0; k < n_topics; k++)
                rcdf[k] = (dcount[k+d_offset] + alpha)*(wcount[k+w_offset] + beta)/(tcount[k] + wbeta);
            for (int k = 1; k < n_topics; k++)
                rcdf[k] += rcdf[k-1];   // prefix sum; read through rcdf so the restrict qualifier holds

            // Choose the most likely topic assignment based on the CDF
            double u = rcdf[n_topics-1]*dsfmt_genrand_close_open(&prng);
            int new_topic = -1;
            for (int k = 0; k < n_topics; k++)
                if (rcdf[k] > u) {
                    new_topic = k;
                    break;
                }
            assign[j] = new_topic;

            // Increment the appropriate counts
            ++dcount[new_topic + d_offset];
            ++wcount[new_topic + w_offset];
            ++tcount[new_topic];
        }

        // Record the end of the iteration
        gettimeofday(&end, NULL);
        diff.tv_sec  += end.tv_sec  - start.tv_sec;
        diff.tv_usec += end.tv_usec - start.tv_usec;
    }

    // Free unused resources
    tsMemFree(cdf);

    // Output the time it took to run the analysis
    double time = 1000.0*diff.tv_sec + 0.001*diff.tv_usec;
    printf("\n - Finished LDA in %g s (avg %g ms per iteration)\n", time/1000.0, time/params->iter);
}

// Fill the document ranks structure
void tsTopicDocRanks(TSRanks *ranks, TSTopics *topics, double alpha)
{
    /*
    const char msg[] = "\r - Index for document %d of %d";

    struct timeval start;
    struct timeval end;
    struct timeval diff;
    diff.tv_sec  = 0;
    diff.tv_usec = 0;

    for (unsigned int i = 0; i < topics->n_docs; i++) {
        unsigned int *ind = &ranks->indices[i*ranks->n_ranks];
        double *val = &ranks->values[i*ranks->n_ranks];

        gettimeofday(&start, NULL);
        tsTopicRankSimilar(ind, val, ranks->n_ranks, topics, i, alpha);
        gettimeofday(&end, NULL);

        printf(msg, i+1, topics->n_docs);
        fflush(stdout);

        diff.tv_sec  += end.tv_sec  - start.tv_sec;
        diff.tv_usec += end.tv_usec - start.tv_usec;
    }

    double time = 1000.0*diff.tv_sec + 0.001*diff.tv_usec;
    printf("\n - Finished generating index in %g s (avg %g ms per doc)\n", time/1000.0, time/topics->n_docs);
    */
}

// Fills the likelihood matrices
void tsTopicLikelihoods(TSTopics *topics, double alpha, double beta)
{
    const char doc_msg[] = "\r - Likelihood matrix for document %d of %d";
    const char wrd_msg[] = "\r - Likelihood matrix for topic %d of %d";

    struct timeval start;
    struct timeval end;
    struct timeval diff;
    diff.tv_sec  = 0;
    diff.tv_usec = 0;

    // Calculate the document-topic matrix
    for (unsigned int d = 0; d < topics->n_docs; d++) {
        gettimeofday(&start, NULL);

        unsigned int sum = 0;
        unsigned int *row = &topics->dcount[d*topics->n_topics];
        double *theta = &topics->theta[d*topics->n_topics];
        for (unsigned int t = 0; t < topics->n_topics; t++)
            sum += row[t];

        double den = (double)sum + alpha*topics->n_topics;
        for (unsigned int t = 0; t < topics->n_topics; t++)
            theta[t] = ((double)row[t] + alpha)/den;

        gettimeofday(&end, NULL);
        printf(doc_msg, d+1, topics->n_docs);
        fflush(stdout);

        diff.tv_sec  += end.tv_sec  - start.tv_sec;
        diff.tv_usec += end.tv_usec - start.tv_usec;
    }

    double doc_time = 1000.0*diff.tv_sec + 0.001*diff.tv_usec;
    printf("\n - Finished document-topic matrix in %g s (avg %g ms per doc)\n", doc_time/1000.0, doc_time/topics->n_docs);

    // If the number of words is zero, don't find the phi matrix (some cases don't need the
    // word-topic distribution to be calculated).
    if (topics->n_words == 0)
        return;

    diff.tv_sec  = 0;
    diff.tv_usec = 0;

    // Calculate the topic-word matrix
    for (unsigned int t = 0; t < topics->n_topics; t++) {
        gettimeofday(&start, NULL);

        double *phi = &topics->phi[t*topics->n_words];
        double den = (double)topics->tcount[t] + beta*topics->n_words;
        for (unsigned int w = 0; w < topics->n_words; w++)
            phi[w] = ((double)topics->wcount[t + w*topics->n_topics] + beta)/den;

        gettimeofday(&end, NULL);
        printf(wrd_msg, t+1, topics->n_topics);
        fflush(stdout);

        diff.tv_sec  += end.tv_sec  - start.tv_sec;
        diff.tv_usec += end.tv_usec - start.tv_usec;
    }

    double wrd_time = 1000.0*diff.tv_sec + 0.001*diff.tv_usec;
    printf("\n - Finished topic-word matrix in %g s (avg %g ms per topic)\n", wrd_time/1000.0, wrd_time/topics->n_topics);
    printf(" - Finished both matrices in %g s\n", (doc_time + wrd_time)/1000.0);
}

// Compute the word likelihoods
double *tsTopicWordDist(TSTopics *topics, unsigned int t)
{
    if (t >= topics->n_topics)  // t is unsigned, so only the upper bound can be violated
        return NULL;
    return &topics->phi[t*topics->n_words];
}

// Return a topic likelihood vector for a given document
double *tsTopicDocDist(TSTopics *topics, unsigned int d)
{
    if (d >= topics->n_docs)
        return NULL;
    return &topics->theta[d*topics->n_topics];
}

// Compute the overall topic mixing parameters
double *tsTopicMixingParam(TSTopics *topics)
{
    double *params; //= (double *)malloc(sizeof(double)*topics->n_topics);
    tsMemAlloc((void **)&params, sizeof(double)*topics->n_topics);

    unsigned int sum = 0;
    for (unsigned int i = 0; i < topics->n_topics; i++)
        sum += topics->tcount[i];
    for (unsigned int i = 0; i < topics->n_topics; i++)
        params[i] = ((double)topics->tcount[i])/sum;
    return params;
}

// Ranks documents by similarity using symmetric KL divergence
/// @todo Optimize the way that values are added into the list (should work for now but will need to be improved)
void tsTopicRankSimilar(unsigned int *list, double *meas, unsigned int N, TSTopics *topics,
                        unsigned int d, double alpha)
{
    double *kl = meas;
    if (!meas)
        tsMemAlloc((void **)&kl, sizeof(double)*N);

    // Find the likelihoods for this particular document
    double *p_x = tsTopicDocDist(topics, d);

    // Find sKL(X,Y) against every document and keep the N closest
    unsigned int num_ins = 0;
    for (unsigned int i = 0; i < topics->n_docs; i++) {
        double div = 0;

        // Find the likelihoods for the document to compare against
        double *p_y = tsTopicDocDist(topics, i);
        for (unsigned int j = 0; j < topics->n_topics; j++)
            div += p_x[j]*log(p_x[j]/p_y[j]) + p_y[j]*log(p_y[j]/p_x[j]);

        // Handle the empty-list case
        if (num_ins == 0) {
            kl[0] = div;
            list[0] = i;
            num_ins++;
            continue;
        }

        // Run through each element and try to find out where to insert it
        int ptr = -1;
        for (unsigned int j = 0; j < num_ins; j++)
            if (kl[j] > div) {
                ptr = (int)j;
                break;
            }

        // Pointer wasn't assigned: the value belongs after everything kept so
        // far, so append it only if the list isn't full
        if (ptr < 0) {
            if (num_ins < N) {
                kl[num_ins]   = div;
                list[num_ins] = i;
                num_ins++;
            }
            continue;
        }

        // Pointer was assigned, so shift the in-use entries down one slot to
        // make room (dropping the last entry if the list is full)
        int last = (num_ins < N) ? (int)num_ins : (int)N - 1;
        for (int j = last; j > ptr; j--) {
            kl[j]   = kl[j-1];
            list[j] = list[j-1];
        }
        kl[ptr]   = div;
        list[ptr] = i;
        num_ins = (num_ins < N) ? num_ins + 1 : N;
    }

    if (!meas)
        tsMemFree(kl);
}

// Utility Functions //

// Initialize the LDA parameter structure.
void tsTopicInitParams(TSParams *params, unsigned int N, unsigned int T, unsigned int S)
{
    params->iter   = N;
    params->topics = T;
    params->seed   = S;
    params->alpha  = 5.0/(double)T;
    params->beta   = 0.01;
}

// Corpus Structure //

// Allocates the memory required for the corpus.
void tsTopicAllocCorpus(TSCorpus *corpus, unsigned int words, unsigned int words_total,
                        unsigned int docs, unsigned int tokens, unsigned int cutoff)
{
    tsMemAlloc((void **)&corpus->tokens,  sizeof(unsigned int)*tokens*2);
    tsMemAlloc((void **)&corpus->fwd_lut, sizeof(unsigned int)*(words+1));
    tsMemAlloc((void **)&corpus->rev_lut, sizeof(unsigned int)*(words_total+1));

    corpus->n_tokens    = tokens;
    corpus->n_words_all = words_total;
    corpus->n_words     = words;
    corpus->n_docs      = docs;
    corpus->cutoff      = cutoff;
}

// Frees the memory associated with the corpus.
void tsTopicFreeCorpus(TSCorpus *corpus)
{
    corpus->n_words = 0;
    corpus->n_docs  = 0;
    // tsMemFree(corpus->occur);
    tsMemFree(corpus->tokens);
    tsMemFree(corpus->fwd_lut);
    tsMemFree(corpus->rev_lut);
}

// Topics Structure //

// Allocates the memory required to store the LDA results.
void tsTopicAlloc(TSTopics *results, unsigned int topics, unsigned int tokens,
                  unsigned int words, unsigned int docs)
{
    results->n_topics = topics;
    results->n_words  = words;
    results->n_docs   = docs;
    results->n_tokens = tokens;

    size_t sz_words  = sizeof(unsigned int)*tokens;
    size_t sz_wcount = sizeof(unsigned int)*topics*words;
    size_t sz_dcount = sizeof(unsigned int)*topics*(docs+1);
    size_t sz_tcount = sizeof(unsigned int)*topics;
    size_t sz_phi    = sizeof(double)*topics*words;
    size_t sz_theta  = sizeof(double)*topics*docs;
    size_t sz_revlut = sizeof(unsigned int)*(docs+1);

    // Allocate the memory
    tsMemAlloc((void **)&results->assign,  sz_words);
    tsMemAlloc((void **)&results->wcount,  sz_wcount);
    tsMemAlloc((void **)&results->dcount,  sz_dcount);
    tsMemAlloc((void **)&results->tcount,  sz_tcount);
    tsMemAlloc((void **)&results->phi,     sz_phi);
    tsMemAlloc((void **)&results->theta,   sz_theta);
    tsMemAlloc((void **)&results->rev_lut, sz_revlut);
}

// Frees the memory for the particular topics structure.
void tsTopicFree(TSTopics *results)
{
    tsMemFree(results->assign);
    tsMemFree(results->wcount);
    tsMemFree(results->dcount);
    tsMemFree(results->tcount);
    tsMemFree(results->phi);
    tsMemFree(results->theta);
    tsMemFree(results->rev_lut);  // rev_lut is allocated in tsTopicAlloc as well

    results->n_words  = 0;
    results->n_topics = 0;
    results->n_docs   = 0;
}
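P.P.S. Since the driver isn't part of the attachment, here's a hypothetical sketch of how the functions above fit together, in case the context helps. This is not the real driver; the sizes and the seed are placeholders, and the token-filling step is elided:

#include "topicsort.h"

int main(void)
{
    TSParams params;
    TSCorpus corpus;
    TSTopics topics;

    // 500 burn-in iterations, 35 topics, arbitrary seed
    tsTopicInitParams(&params, 500, 35, 1234);

    // Placeholder sizes: words, words_total, docs, tokens, cutoff
    tsTopicAllocCorpus(&corpus, 5400, 50000, 7800, 7000000, 5);

    // ... the loader fills corpus.tokens here: word ids in the first half,
    // document ids in the second half (tokens[i] and tokens[i + n_tokens]) ...

    tsTopicAlloc(&topics, params.topics, corpus.n_tokens, corpus.n_words, corpus.n_docs);

    tsTopicLDA(&topics, &params, &corpus);                   // the hot path
    tsTopicLikelihoods(&topics, params.alpha, params.beta);  // theta and phi

    tsTopicFree(&topics);
    tsTopicFreeCorpus(&corpus);
    return 0;
}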