How to declare and malloc pointer inside OpenMP parallel region? (ERROR: Segment violation ('core'...
I am doing it like:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
#pragma omp parallel
{
int ** localClusterMemberCount;
int * activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
for (int i = 0; i < numClusters; ++i) {
if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
// for each numCoordsension
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
}
// free memory
#pragma omp single
{
free (localClusterMemberCount[0]);
free (localClusterMemberCount);
free (activeCluster);
}
}
free(clusterMemberCount);
But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.
Any idea how can I solve it?
c openmp
add a comment |
I am doing it like:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
#pragma omp parallel
{
int ** localClusterMemberCount;
int * activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
for (int i = 0; i < numClusters; ++i) {
if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
// for each numCoordsension
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
}
// free memory
#pragma omp single
{
free (localClusterMemberCount[0]);
free (localClusterMemberCount);
free (activeCluster);
}
}
free(clusterMemberCount);
But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.
Any idea how can I solve it?
c openmp
add a comment |
I am doing it like:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
#pragma omp parallel
{
int ** localClusterMemberCount;
int * activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
for (int i = 0; i < numClusters; ++i) {
if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
// for each numCoordsension
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
}
// free memory
#pragma omp single
{
free (localClusterMemberCount[0]);
free (localClusterMemberCount);
free (activeCluster);
}
}
free(clusterMemberCount);
But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.
Any idea how can I solve it?
c openmp
I am doing it like:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
#pragma omp parallel
{
int ** localClusterMemberCount;
int * activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
for (int i = 0; i < numClusters; ++i) {
if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
// for each numCoordsension
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i]; /// XXXX will divide by zero here for any empty clusters!
}
// free memory
#pragma omp single
{
free (localClusterMemberCount[0]);
free (localClusterMemberCount);
free (activeCluster);
}
}
free(clusterMemberCount);
But I get the error: Segment violation ('core' generated) so I am doing something bad, and I think the error is on mallocing pointers due to I have tried sequential code and it is working fine. Also I have tried parallel code but without mallocs (using globals variables with atomic) and that works fine too. The error only apears when I try to create private pointers and malloc them.
Any idea how can I solve it?
c openmp
c openmp
asked Nov 21 '18 at 12:27
JuMoGarJuMoGar
8691218
8691218
add a comment |
add a comment |
1 Answer
1
active
oldest
votes
Two reasons for the segfault:
localClusterMemberCountshould be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through thesinglesection, that points to a random location of the memory.- An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.
There are few other issues with the code. See below with my own comments flagged with ***:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
/* ***
* This has to be a shared variable that each thread can access
* If declared inside the parallel region, it will be a thread-local variable
* which is left un-initialized for all but one thread. Further attempts to access
* that variable will lead to segfaults
*/
int ** localClusterMemberCount;
#pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
{
// *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
int activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster];
// sum point coordinates for finding centroid
// *** This may be slower in parallel because of the atomic operation
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
/* ***
* Missing: one reduction step
* The global cluster member count needs to be updated
* one option is below :
*/
#pragma omp critical
for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
#pragma omp barrier // wait here before moving on
// *** The code below was wrong; to compute the average, coordinates should be divided by the global count
// *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount != 0)
// for each numCoordsension
#pragma omp simd //not sure this will help, the compiler may already vectorize that
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
// *** ^^ atomic is not needed
// *** only one thread will access each value of clusterCentroID
}
#pragma omp barrier
/* ***
* A barrier is needed otherwise the first thread arriving there will start to free the memory
* Other threads may still be in the previous loop attempting to access localClusterMemberCount
* If the pointer has been freed already, this will result in a segfault
*
* With the corrected code, the implicit barrier at the end of the distributed
* for loop would be sufficient. With your initial code, an explicit barrier
* would have been needed.
*/
// free memory
#pragma omp single
{
// *** Need to free all pointers and not only the first one
for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
free (localClusterMemberCount);
}
}
free(clusterMemberCount);
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
How many dimensionsnumCoordsdo you have ?
– Brice
Nov 21 '18 at 14:43
numCoordsis 13 andnumObjsis 239108
– JuMoGar
Nov 21 '18 at 15:00
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53412033%2fhow-to-declare-and-malloc-pointer-inside-openmp-parallel-region-error-segment%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
Two reasons for the segfault:
localClusterMemberCountshould be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through thesinglesection, that points to a random location of the memory.- An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.
There are few other issues with the code. See below with my own comments flagged with ***:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
/* ***
* This has to be a shared variable that each thread can access
* If declared inside the parallel region, it will be a thread-local variable
* which is left un-initialized for all but one thread. Further attempts to access
* that variable will lead to segfaults
*/
int ** localClusterMemberCount;
#pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
{
// *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
int activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster];
// sum point coordinates for finding centroid
// *** This may be slower in parallel because of the atomic operation
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
/* ***
* Missing: one reduction step
* The global cluster member count needs to be updated
* one option is below :
*/
#pragma omp critical
for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
#pragma omp barrier // wait here before moving on
// *** The code below was wrong; to compute the average, coordinates should be divided by the global count
// *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount != 0)
// for each numCoordsension
#pragma omp simd //not sure this will help, the compiler may already vectorize that
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
// *** ^^ atomic is not needed
// *** only one thread will access each value of clusterCentroID
}
#pragma omp barrier
/* ***
* A barrier is needed otherwise the first thread arriving there will start to free the memory
* Other threads may still be in the previous loop attempting to access localClusterMemberCount
* If the pointer has been freed already, this will result in a segfault
*
* With the corrected code, the implicit barrier at the end of the distributed
* for loop would be sufficient. With your initial code, an explicit barrier
* would have been needed.
*/
// free memory
#pragma omp single
{
// *** Need to free all pointers and not only the first one
for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
free (localClusterMemberCount);
}
}
free(clusterMemberCount);
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
How many dimensionsnumCoordsdo you have ?
– Brice
Nov 21 '18 at 14:43
numCoordsis 13 andnumObjsis 239108
– JuMoGar
Nov 21 '18 at 15:00
add a comment |
Two reasons for the segfault:
localClusterMemberCountshould be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through thesinglesection, that points to a random location of the memory.- An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.
There are few other issues with the code. See below with my own comments flagged with ***:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
/* ***
* This has to be a shared variable that each thread can access
* If declared inside the parallel region, it will be a thread-local variable
* which is left un-initialized for all but one thread. Further attempts to access
* that variable will lead to segfaults
*/
int ** localClusterMemberCount;
#pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
{
// *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
int activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster];
// sum point coordinates for finding centroid
// *** This may be slower in parallel because of the atomic operation
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
/* ***
* Missing: one reduction step
* The global cluster member count needs to be updated
* one option is below :
*/
#pragma omp critical
for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
#pragma omp barrier // wait here before moving on
// *** The code below was wrong; to compute the average, coordinates should be divided by the global count
// *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount != 0)
// for each numCoordsension
#pragma omp simd //not sure this will help, the compiler may already vectorize that
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
// *** ^^ atomic is not needed
// *** only one thread will access each value of clusterCentroID
}
#pragma omp barrier
/* ***
* A barrier is needed otherwise the first thread arriving there will start to free the memory
* Other threads may still be in the previous loop attempting to access localClusterMemberCount
* If the pointer has been freed already, this will result in a segfault
*
* With the corrected code, the implicit barrier at the end of the distributed
* for loop would be sufficient. With your initial code, an explicit barrier
* would have been needed.
*/
// free memory
#pragma omp single
{
// *** Need to free all pointers and not only the first one
for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
free (localClusterMemberCount);
}
}
free(clusterMemberCount);
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
How many dimensionsnumCoordsdo you have ?
– Brice
Nov 21 '18 at 14:43
numCoordsis 13 andnumObjsis 239108
– JuMoGar
Nov 21 '18 at 15:00
add a comment |
Two reasons for the segfault:
localClusterMemberCountshould be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through thesinglesection, that points to a random location of the memory.- An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.
There are few other issues with the code. See below with my own comments flagged with ***:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
/* ***
* This has to be a shared variable that each thread can access
* If declared inside the parallel region, it will be a thread-local variable
* which is left un-initialized for all but one thread. Further attempts to access
* that variable will lead to segfaults
*/
int ** localClusterMemberCount;
#pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
{
// *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
int activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster];
// sum point coordinates for finding centroid
// *** This may be slower in parallel because of the atomic operation
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
/* ***
* Missing: one reduction step
* The global cluster member count needs to be updated
* one option is below :
*/
#pragma omp critical
for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
#pragma omp barrier // wait here before moving on
// *** The code below was wrong; to compute the average, coordinates should be divided by the global count
// *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount != 0)
// for each numCoordsension
#pragma omp simd //not sure this will help, the compiler may already vectorize that
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
// *** ^^ atomic is not needed
// *** only one thread will access each value of clusterCentroID
}
#pragma omp barrier
/* ***
* A barrier is needed otherwise the first thread arriving there will start to free the memory
* Other threads may still be in the previous loop attempting to access localClusterMemberCount
* If the pointer has been freed already, this will result in a segfault
*
* With the corrected code, the implicit barrier at the end of the distributed
* for loop would be sufficient. With your initial code, an explicit barrier
* would have been needed.
*/
// free memory
#pragma omp single
{
// *** Need to free all pointers and not only the first one
for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
free (localClusterMemberCount);
}
}
free(clusterMemberCount);
Two reasons for the segfault:
localClusterMemberCountshould be a shared variable declared outside of the parallel region, initialized within the parallel region by a single thread. Otherwise, each thread has its own copy of the variable, and for all but the thread that has gone through thesinglesection, that points to a random location of the memory.- An implicit or explicit barrier is needed before the section of code where pointers are freed. All threads needs to be done for sure before memory can be disallocated, otherwise one thread may free pointers still being used by other threads.
There are few other issues with the code. See below with my own comments flagged with ***:
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));
/* ***
* This has to be a shared variable that each thread can access
* If declared inside the parallel region, it will be a thread-local variable
* which is left un-initialized for all but one thread. Further attempts to access
* that variable will lead to segfaults
*/
int ** localClusterMemberCount;
#pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
{
// *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
int activeCluster;
#pragma omp single
{
localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
//localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
for (int i = 0; i < omp_get_num_threads(); ++i) {
localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
//localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
}
}
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++localClusterMemberCount[omp_get_thread_num()][activeCluster];
// sum point coordinates for finding centroid
// *** This may be slower in parallel because of the atomic operation
for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
/* ***
* Missing: one reduction step
* The global cluster member count needs to be updated
* one option is below :
*/
#pragma omp critical
for (int i=0; i < numClusters; ++i) clusterMemberCount+=localClusterMemberCount[omp_get_thread_num()];
#pragma omp barrier // wait here before moving on
// *** The code below was wrong; to compute the average, coordinates should be divided by the global count
// *** Sucessive divisions by local count will fail. Like, 1/(4+6) is not the same as (1/4)/6
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount != 0)
// for each numCoordsension
#pragma omp simd //not sure this will help, the compiler may already vectorize that
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
// *** ^^ atomic is not needed
// *** only one thread will access each value of clusterCentroID
}
#pragma omp barrier
/* ***
* A barrier is needed otherwise the first thread arriving there will start to free the memory
* Other threads may still be in the previous loop attempting to access localClusterMemberCount
* If the pointer has been freed already, this will result in a segfault
*
* With the corrected code, the implicit barrier at the end of the distributed
* for loop would be sufficient. With your initial code, an explicit barrier
* would have been needed.
*/
// free memory
#pragma omp single
{
// *** Need to free all pointers and not only the first one
for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
free (localClusterMemberCount);
}
}
free(clusterMemberCount);
answered Nov 21 '18 at 13:28
BriceBrice
1,400110
1,400110
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
How many dimensionsnumCoordsdo you have ?
– Brice
Nov 21 '18 at 14:43
numCoordsis 13 andnumObjsis 239108
– JuMoGar
Nov 21 '18 at 15:00
add a comment |
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
How many dimensionsnumCoordsdo you have ?
– Brice
Nov 21 '18 at 14:43
numCoordsis 13 andnumObjsis 239108
– JuMoGar
Nov 21 '18 at 15:00
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
Now the code is working. It lose a lot of troughput due to barriers and criticals, but that is other problem. Declarations and mallocing are working perfectly. Thank you.
– JuMoGar
Nov 21 '18 at 13:50
How many dimensions
numCoords do you have ?– Brice
Nov 21 '18 at 14:43
How many dimensions
numCoords do you have ?– Brice
Nov 21 '18 at 14:43
numCoords is 13 and numObjs is 239108– JuMoGar
Nov 21 '18 at 15:00
numCoords is 13 and numObjs is 239108– JuMoGar
Nov 21 '18 at 15:00
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53412033%2fhow-to-declare-and-malloc-pointer-inside-openmp-parallel-region-error-segment%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown