How to declare and malloc pointer inside OpenMP parallel region? (ERROR: Segment violation ('core'...












0















I am doing it like:



void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

#pragma omp parallel
{
int ** localClusterMemberCount;
int * activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
}

// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
}

// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
for (int i = 0; i < numClusters; ++i) {
if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
// for each numCoordsension
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
}

// free memory
#pragma omp single
{
free (localClusterMemberCount[0]);
free (localClusterMemberCount);
free (activeCluster);
}
}
free(clusterMemberCount);


But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.



Any idea how can I solve it?










share|improve this question



























    0















    I am doing it like:



    void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

    int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

    #pragma omp parallel
    {
    int ** localClusterMemberCount;
    int * activeCluster;
    #pragma omp single
    {
    localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
    //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
    for (int i = 0; i < omp_get_num_threads(); ++i) {
    localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
    //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
    }
    activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
    }

    // sum all points
    // for every point
    for (int i = 0; i < numObjs; ++i) {
    // which cluster is it in?
    activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
    // update count of members in that cluster
    ++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
    // sum point coordinates for finding centroid
    for (int j = 0; j < numCoords; ++j)
    #pragma omp atomic
    clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
    }

    // now divide each coordinate sum by number of members to find mean/centroid
    // for each cluster
    for (int i = 0; i < numClusters; ++i) {
    if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
    // for each numCoordsension
    for (int j = 0; j < numCoords; ++j)
    #pragma omp atomic
    clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
    }

    // free memory
    #pragma omp single
    {
    free (localClusterMemberCount[0]);
    free (localClusterMemberCount);
    free (activeCluster);
    }
    }
    free(clusterMemberCount);


    But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.



    Any idea how can I solve it?










    share|improve this question

























      0












      0








      0








      I am doing it like:



      void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

      int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

      #pragma omp parallel
      {
      int ** localClusterMemberCount;
      int * activeCluster;
      #pragma omp single
      {
      localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
      //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
      for (int i = 0; i < omp_get_num_threads(); ++i) {
      localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
      //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
      }
      activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
      }

      // sum all points
      // for every point
      for (int i = 0; i < numObjs; ++i) {
      // which cluster is it in?
      activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
      // update count of members in that cluster
      ++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
      // sum point coordinates for finding centroid
      for (int j = 0; j < numCoords; ++j)
      #pragma omp atomic
      clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
      }

      // now divide each coordinate sum by number of members to find mean/centroid
      // for each cluster
      for (int i = 0; i < numClusters; ++i) {
      if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
      // for each numCoordsension
      for (int j = 0; j < numCoords; ++j)
      #pragma omp atomic
      clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
      }

      // free memory
      #pragma omp single
      {
      free (localClusterMemberCount[0]);
      free (localClusterMemberCount);
      free (activeCluster);
      }
      }
      free(clusterMemberCount);


      But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.



      Any idea how can I solve it?










      share|improve this question














      I am doing it like:



      void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

      int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

      #pragma omp parallel
      {
      int ** localClusterMemberCount;
      int * activeCluster;
      #pragma omp single
      {
      localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
      //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
      for (int i = 0; i < omp_get_num_threads(); ++i) {
      localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
      //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
      }
      activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
      }

      // sum all points
      // for every point
      for (int i = 0; i < numObjs; ++i) {
      // which cluster is it in?
      activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
      // update count of members in that cluster
      ++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
      // sum point coordinates for finding centroid
      for (int j = 0; j < numCoords; ++j)
      #pragma omp atomic
      clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
      }

      // now divide each coordinate sum by number of members to find mean/centroid
      // for each cluster
      for (int i = 0; i < numClusters; ++i) {
      if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
      // for each numCoordsension
      for (int j = 0; j < numCoords; ++j)
      #pragma omp atomic
      clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
      }

      // free memory
      #pragma omp single
      {
      free (localClusterMemberCount[0]);
      free (localClusterMemberCount);
      free (activeCluster);
      }
      }
      free(clusterMemberCount);


      But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.



      Any idea how can I solve it?







      c openmp






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Nov 21 '18 at 12:27









      JuMoGarJuMoGar

      8691218




      8691218
























          1 Answer
          1






          active

          oldest

          votes


















          2














          Two reasons for the segfault:





          • localClusterMemberCount should be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through the singlesection, that points to a random location of the memory.

          • An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.


          There are few other issues with the code. See below with my own comments flagged with ***:



          void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

          int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

          /* ***
          * This has to be a shared variable that each thread can access
          * If declared inside the parallel region, it will be a thread-local variable
          * which is left un-initialized for all but one thread. Further attempts to access
          * that variable will lead to segfaults
          */
          int ** localClusterMemberCount;
          #pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
          {

          // *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
          int activeCluster;
          #pragma omp single
          {
          localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
          //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
          for (int i = 0; i < omp_get_num_threads(); ++i) {
          localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
          //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
          }
          }

          // sum all points
          // for every point
          for (int i = 0; i < numObjs; ++i) {
          // which cluster is it in?
          activeCluster = clusterAssignmentCurrent[i];
          // update count of members in that cluster
          ++localClusterMemberCount[omp_get_thread_num()][activeCluster];
          // sum point coordinates for finding centroid

          // *** This may be slower in parallel because of the atomic operation
          for (int j = 0; j < numCoords; ++j)
          #pragma omp atomic
          clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
          }

          /* ***
          * Missing: one reduction step
          * The global cluster member count needs to be updated
          * one option is below :
          */
          #pragma omp critical
          for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
          #pragma omp barrier // wait here before moving on



          // *** The code below was wrong; to compute the average, coordinates should be divided by the global count
          // *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6

          // now divide each coordinate sum by number of members to find mean/centroid
          // for each cluster
          #pragma omp for
          for (int i = 0; i < numClusters; ++i) {
          if (clusterMemberCount != 0)
          // for each numCoordsension
          #pragma omp simd //not sure this will help, the compiler may already vectorize that
          for (int j = 0; j < numCoords; ++j)
          clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
          // *** ^^ atomic is not needed
          // *** only one thread will access each value of clusterCentroID

          }

          #pragma omp barrier
          /* ***
          * A barrier is needed otherwise the first thread arriving there will start to free the memory
          * Other threads may still be in the previous loop attempting to access localClusterMemberCount
          * If the pointer has been freed already, this will result in a segfault
          *
          * With the corrected code, the implicit barrier at the end of the distributed
          * for loop would be sufficient. With your initial code, an explicit barrier
          * would have been needed.
          */

          // free memory
          #pragma omp single
          {
          // *** Need to free all pointers and not only the first one
          for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
          free (localClusterMemberCount);
          }
          }
          free(clusterMemberCount);





          share|improve this answer
























          • Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

            – JuMoGar
            Nov 21 '18 at 13:50











          • How many dimensions numCoords do you have ?

            – Brice
            Nov 21 '18 at 14:43











          • numCoords is 13 and numObjs is 239108

            – JuMoGar
            Nov 21 '18 at 15:00











          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53412033%2fhow-to-declare-and-malloc-pointer-inside-openmp-parallel-region-error-segment%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          1 Answer
          1






          active

          oldest

          votes








          1 Answer
          1






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes









          2














          Two reasons for the segfault:





          • localClusterMemberCount should be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through the singlesection, that points to a random location of the memory.

          • An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.


          There are few other issues with the code. See below with my own comments flagged with ***:



          void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

          int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

          /* ***
          * This has to be a shared variable that each thread can access
          * If declared inside the parallel region, it will be a thread-local variable
          * which is left un-initialized for all but one thread. Further attempts to access
          * that variable will lead to segfaults
          */
          int ** localClusterMemberCount;
          #pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
          {

          // *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
          int activeCluster;
          #pragma omp single
          {
          localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
          //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
          for (int i = 0; i < omp_get_num_threads(); ++i) {
          localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
          //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
          }
          }

          // sum all points
          // for every point
          for (int i = 0; i < numObjs; ++i) {
          // which cluster is it in?
          activeCluster = clusterAssignmentCurrent[i];
          // update count of members in that cluster
          ++localClusterMemberCount[omp_get_thread_num()][activeCluster];
          // sum point coordinates for finding centroid

          // *** This may be slower in parallel because of the atomic operation
          for (int j = 0; j < numCoords; ++j)
          #pragma omp atomic
          clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
          }

          /* ***
          * Missing: one reduction step
          * The global cluster member count needs to be updated
          * one option is below :
          */
          #pragma omp critical
          for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
          #pragma omp barrier // wait here before moving on



          // *** The code below was wrong; to compute the average, coordinates should be divided by the global count
          // *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6

          // now divide each coordinate sum by number of members to find mean/centroid
          // for each cluster
          #pragma omp for
          for (int i = 0; i < numClusters; ++i) {
          if (clusterMemberCount != 0)
          // for each numCoordsension
          #pragma omp simd //not sure this will help, the compiler may already vectorize that
          for (int j = 0; j < numCoords; ++j)
          clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
          // *** ^^ atomic is not needed
          // *** only one thread will access each value of clusterCentroID

          }

          #pragma omp barrier
          /* ***
          * A barrier is needed otherwise the first thread arriving there will start to free the memory
          * Other threads may still be in the previous loop attempting to access localClusterMemberCount
          * If the pointer has been freed already, this will result in a segfault
          *
          * With the corrected code, the implicit barrier at the end of the distributed
          * for loop would be sufficient. With your initial code, an explicit barrier
          * would have been needed.
          */

          // free memory
          #pragma omp single
          {
          // *** Need to free all pointers and not only the first one
          for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
          free (localClusterMemberCount);
          }
          }
          free(clusterMemberCount);





          share|improve this answer
























          • Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

            – JuMoGar
            Nov 21 '18 at 13:50











          • How many dimensions numCoords do you have ?

            – Brice
            Nov 21 '18 at 14:43











          • numCoords is 13 and numObjs is 239108

            – JuMoGar
            Nov 21 '18 at 15:00
















          2














          Two reasons for the segfault:





          • localClusterMemberCount should be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through the singlesection, that points to a random location of the memory.

          • An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.


          There are few other issues with the code. See below with my own comments flagged with ***:



          void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

          int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

          /* ***
          * This has to be a shared variable that each thread can access
          * If declared inside the parallel region, it will be a thread-local variable
          * which is left un-initialized for all but one thread. Further attempts to access
          * that variable will lead to segfaults
          */
          int ** localClusterMemberCount;
          #pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
          {

          // *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
          int activeCluster;
          #pragma omp single
          {
          localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
          //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
          for (int i = 0; i < omp_get_num_threads(); ++i) {
          localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
          //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
          }
          }

          // sum all points
          // for every point
          for (int i = 0; i < numObjs; ++i) {
          // which cluster is it in?
          activeCluster = clusterAssignmentCurrent[i];
          // update count of members in that cluster
          ++localClusterMemberCount[omp_get_thread_num()][activeCluster];
          // sum point coordinates for finding centroid

          // *** This may be slower in parallel because of the atomic operation
          for (int j = 0; j < numCoords; ++j)
          #pragma omp atomic
          clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
          }

          /* ***
          * Missing: one reduction step
          * The global cluster member count needs to be updated
          * one option is below :
          */
          #pragma omp critical
          for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
          #pragma omp barrier // wait here before moving on



          // *** The code below was wrong; to compute the average, coordinates should be divided by the global count
          // *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6

          // now divide each coordinate sum by number of members to find mean/centroid
          // for each cluster
          #pragma omp for
          for (int i = 0; i < numClusters; ++i) {
          if (clusterMemberCount != 0)
          // for each numCoordsension
          #pragma omp simd //not sure this will help, the compiler may already vectorize that
          for (int j = 0; j < numCoords; ++j)
          clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
          // *** ^^ atomic is not needed
          // *** only one thread will access each value of clusterCentroID

          }

          #pragma omp barrier
          /* ***
          * A barrier is needed otherwise the first thread arriving there will start to free the memory
          * Other threads may still be in the previous loop attempting to access localClusterMemberCount
          * If the pointer has been freed already, this will result in a segfault
          *
          * With the corrected code, the implicit barrier at the end of the distributed
          * for loop would be sufficient. With your initial code, an explicit barrier
          * would have been needed.
          */

          // free memory
          #pragma omp single
          {
          // *** Need to free all pointers and not only the first one
          for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
          free (localClusterMemberCount);
          }
          }
          free(clusterMemberCount);





          share|improve this answer
























          • Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

            – JuMoGar
            Nov 21 '18 at 13:50











          • How many dimensions numCoords do you have ?

            – Brice
            Nov 21 '18 at 14:43











          • numCoords is 13 and numObjs is 239108

            – JuMoGar
            Nov 21 '18 at 15:00














          2












          2








          2







          Two reasons for the segfault:





          • localClusterMemberCount should be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through the singlesection, that points to a random location of the memory.

          • An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.


          There are few other issues with the code. See below with my own comments flagged with ***:



          void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

          int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

          /* ***
          * This has to be a shared variable that each thread can access
          * If declared inside the parallel region, it will be a thread-local variable
          * which is left un-initialized for all but one thread. Further attempts to access
          * that variable will lead to segfaults
          */
          int ** localClusterMemberCount;
          #pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
          {

          // *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
          int activeCluster;
          #pragma omp single
          {
          localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
          //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
          for (int i = 0; i < omp_get_num_threads(); ++i) {
          localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
          //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
          }
          }

          // sum all points
          // for every point
          for (int i = 0; i < numObjs; ++i) {
          // which cluster is it in?
          activeCluster = clusterAssignmentCurrent[i];
          // update count of members in that cluster
          ++localClusterMemberCount[omp_get_thread_num()][activeCluster];
          // sum point coordinates for finding centroid

          // *** This may be slower in parallel because of the atomic operation
          for (int j = 0; j < numCoords; ++j)
          #pragma omp atomic
          clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
          }

          /* ***
          * Missing: one reduction step
          * The global cluster member count needs to be updated
          * one option is below :
          */
          #pragma omp critical
          for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
          #pragma omp barrier // wait here before moving on



          // *** The code below was wrong; to compute the average, coordinates should be divided by the global count
          // *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6

          // now divide each coordinate sum by number of members to find mean/centroid
          // for each cluster
          #pragma omp for
          for (int i = 0; i < numClusters; ++i) {
          if (clusterMemberCount != 0)
          // for each numCoordsension
          #pragma omp simd //not sure this will help, the compiler may already vectorize that
          for (int j = 0; j < numCoords; ++j)
          clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
          // *** ^^ atomic is not needed
          // *** only one thread will access each value of clusterCentroID

          }

          #pragma omp barrier
          /* ***
          * A barrier is needed otherwise the first thread arriving there will start to free the memory
          * Other threads may still be in the previous loop attempting to access localClusterMemberCount
          * If the pointer has been freed already, this will result in a segfault
          *
          * With the corrected code, the implicit barrier at the end of the distributed
          * for loop would be sufficient. With your initial code, an explicit barrier
          * would have been needed.
          */

          // free memory
          #pragma omp single
          {
          // *** Need to free all pointers and not only the first one
          for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
          free (localClusterMemberCount);
          }
          }
          free(clusterMemberCount);





          share|improve this answer













          Two reasons for the segfault:





          • localClusterMemberCount should be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through the singlesection, that points to a random location of the memory.

          • An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.


          There are few other issues with the code. See below with my own comments flagged with ***:



          void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

          int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

          /* ***
          * This has to be a shared variable that each thread can access
          * If declared inside the parallel region, it will be a thread-local variable
          * which is left un-initialized for all but one thread. Further attempts to access
          * that variable will lead to segfaults
          */
          int ** localClusterMemberCount;
          #pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
          {

          // *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
          int activeCluster;
          #pragma omp single
          {
          localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
          //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
          for (int i = 0; i < omp_get_num_threads(); ++i) {
          localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
          //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
          }
          }

          // sum all points
          // for every point
          for (int i = 0; i < numObjs; ++i) {
          // which cluster is it in?
          activeCluster = clusterAssignmentCurrent[i];
          // update count of members in that cluster
          ++localClusterMemberCount[omp_get_thread_num()][activeCluster];
          // sum point coordinates for finding centroid

          // *** This may be slower in parallel because of the atomic operation
          for (int j = 0; j < numCoords; ++j)
          #pragma omp atomic
          clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
          }

          /* ***
          * Missing: one reduction step
          * The global cluster member count needs to be updated
          * one option is below :
          */
          #pragma omp critical
          for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
          #pragma omp barrier // wait here before moving on



          // *** The code below was wrong; to compute the average, coordinates should be divided by the global count
          // *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6

          // now divide each coordinate sum by number of members to find mean/centroid
          // for each cluster
          #pragma omp for
          for (int i = 0; i < numClusters; ++i) {
          if (clusterMemberCount != 0)
          // for each numCoordsension
          #pragma omp simd //not sure this will help, the compiler may already vectorize that
          for (int j = 0; j < numCoords; ++j)
          clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
          // *** ^^ atomic is not needed
          // *** only one thread will access each value of clusterCentroID

          }

          #pragma omp barrier
          /* ***
          * A barrier is needed otherwise the first thread arriving there will start to free the memory
          * Other threads may still be in the previous loop attempting to access localClusterMemberCount
          * If the pointer has been freed already, this will result in a segfault
          *
          * With the corrected code, the implicit barrier at the end of the distributed
          * for loop would be sufficient. With your initial code, an explicit barrier
          * would have been needed.
          */

          // free memory
          #pragma omp single
          {
          // *** Need to free all pointers and not only the first one
          for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
          free (localClusterMemberCount);
          }
          }
          free(clusterMemberCount);






          share|improve this answer












          share|improve this answer



          share|improve this answer










          answered Nov 21 '18 at 13:28









          BriceBrice

          1,400110




          1,400110













          • Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

            – JuMoGar
            Nov 21 '18 at 13:50











          • How many dimensions numCoords do you have ?

            – Brice
            Nov 21 '18 at 14:43











          • numCoords is 13 and numObjs is 239108

            – JuMoGar
            Nov 21 '18 at 15:00



















          • Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

            – JuMoGar
            Nov 21 '18 at 13:50











          • How many dimensions numCoords do you have ?

            – Brice
            Nov 21 '18 at 14:43











          • numCoords is 13 and numObjs is 239108

            – JuMoGar
            Nov 21 '18 at 15:00

















          Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

          – JuMoGar
          Nov 21 '18 at 13:50





          Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.

          – JuMoGar
          Nov 21 '18 at 13:50













          How many dimensions numCoords do you have ?

          – Brice
          Nov 21 '18 at 14:43





          How many dimensions numCoords do you have ?

          – Brice
          Nov 21 '18 at 14:43













          numCoords is 13 and numObjs is 239108

          – JuMoGar
          Nov 21 '18 at 15:00





          numCoords is 13 and numObjs is 239108

          – JuMoGar
          Nov 21 '18 at 15:00


















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Stack Overflow!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53412033%2fhow-to-declare-and-malloc-pointer-inside-openmp-parallel-region-error-segment%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Paul Cézanne

          UIScrollView CustomStickyHeader Resize height generates problems when scroll is too fast

          Angular material date-picker (MatDatepicker) auto completes the date on focus out