@@ -8,18 +8,14 @@ namespace op
88 const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
99 {
1010 const auto globalIdx = threadIdx.x;
11- // const auto xIndex = 2*globalIdx;
12- // const auto yIndex = xIndex+1;
13- const auto xIndex = globalIdx;
14- const auto yIndex = numberPeople+globalIdx;
1511
1612 // Fill shared parameters
17- // if (globalIdx < numberPeople)
13+ if (globalIdx < numberPeople)
1814 {
19- auto minValueX = (float)targetWidth;
20- auto minValueY = (float)targetHeight;
21- auto maxValueX = 0.f;
22- auto maxValueY = 0.f;
15+ float minValueX = (float)targetWidth;
16+ float minValueY = (float)targetHeight;
17+ float maxValueX = 0.f;
18+ float maxValueY = 0.f;
2319 for (auto part = 0 ; part < numberParts ; part++)
2420 {
2521 const auto index = 3 * (globalIdx*numberParts + part);
@@ -51,13 +47,18 @@ namespace op
5147 minValueY -= constantToAdd;
5248 }
5349
50+ // const auto xIndex = 2*globalIdx;
51+ // const auto yIndex = xIndex+1;
52+ const auto xIndex = globalIdx;
53+ const auto yIndex = numberPeople+globalIdx;
5454 minPtr[xIndex] = minValueX;
5555 minPtr[yIndex] = minValueY;
5656 maxPtr[xIndex] = maxValueX;
5757 maxPtr[yIndex] = maxValueY;
5858 }
5959 }
6060
61+ // Note: renderKeypoints does not work correctly for videos with many people, so the speed of renderKeypointsOld was slightly improved instead.
6162 __inline__ __device__ void renderKeypoints(
6263 float* targetPtr, float* sharedMaxs, float* sharedMins, float* sharedScaleF, const float* const maxPtr,
6364 const float* const minPtr, const float* const scalePtr, const int globalIdx, const int x, const int y,
@@ -82,9 +83,9 @@ namespace op
8283 if (x < targetWidth && y < targetHeight)
8384 {
8485 const auto baseIndex = 3*(y * targetWidth + x);
85- auto b = targetPtr[baseIndex];
86- auto g = targetPtr[baseIndex+1];
87- auto r = targetPtr[baseIndex+2];
86+ float b = targetPtr[baseIndex];
87+ float g = targetPtr[baseIndex+1];
88+ float r = targetPtr[baseIndex+2];
8889 if (!blendOriginalFrame)
8990 {
9091 b = 0.f;
@@ -104,7 +105,7 @@ namespace op
104105 const auto xIndex = person;
105106 const auto yIndex = numberPeople+person;
106107 if (x <= sharedMaxs[xIndex] && x >= sharedMins[xIndex]
107- && y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex])
108+ && y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex])
108109 {
109110 // Part pair connections
110111 for (auto partPair = 0; partPair < numberPartPairs; partPair++)
@@ -195,7 +196,6 @@ namespace op
195196 if (minr2 <= dist2 && dist2 <= maxr2)
196197 addColorWeighted(r, g, b, &rgbColorsPtr[(part%numberColors)*3], alphaColorToAdd);
197198 }
198-
199199 }
200200 }
201201 }
@@ -218,10 +218,10 @@ namespace op
218218 // Fill shared parameters
219219 if (globalIdx < numberPeople)
220220 {
221- sharedMins[globalIdx].x = targetWidth;
222- sharedMins[globalIdx].y = targetHeight;
223- sharedMaxs[globalIdx].x = 0.f;
224- sharedMaxs[globalIdx].y = 0.f;
221+ float minValueX = (float) targetWidth;
222+ float minValueY = (float) targetHeight;
223+ float maxValueX = 0.f;
224+ float maxValueY = 0.f;
225225 for (auto part = 0 ; part < numberParts ; part++)
226226 {
227227 const auto index = 3 * (globalIdx*numberParts + part);
@@ -230,38 +230,43 @@ namespace op
230230 const auto score = keypointsPtr[index+2];
231231 if (score > threshold)
232232 {
233- if (x < sharedMins[globalIdx].x )
234- sharedMins[globalIdx].x = x;
235- if (x > sharedMaxs[globalIdx].x )
236- sharedMaxs[globalIdx].x = x;
237- if (y < sharedMins[globalIdx].y )
238- sharedMins[globalIdx].y = y;
239- if (y > sharedMaxs[globalIdx].y )
240- sharedMaxs[globalIdx].y = y;
233+ if (x < minValueX )
234+ minValueX = x;
235+ if (x > maxValueX )
236+ maxValueX = x;
237+ if (y < minValueY )
238+ minValueY = y;
239+ if (y > maxValueY )
240+ maxValueY = y;
241241 }
242242 }
243- if (sharedMaxs[globalIdx].x != 0.f && sharedMaxs[globalIdx].y != 0.f)
243+ if (maxValueX != 0.f && maxValueY != 0.f)
244244 {
245- const auto averageX = sharedMaxs[globalIdx].x - sharedMins[globalIdx].x ;
246- const auto averageY = sharedMaxs[globalIdx].y - sharedMins[globalIdx].y ;
245+ const auto averageX = maxValueX - minValueX ;
246+ const auto averageY = maxValueY - minValueY ;
247247 // (averageX + averageY) / 2.f / 400.f
248248 sharedScaleF[globalIdx] = fastTruncateCuda((averageX + averageY) / 400.f, 0.33f, 1.f);
249249 const auto constantToAdd = 50.f;
250- sharedMaxs[globalIdx].x += constantToAdd;
251- sharedMaxs[globalIdx].y += constantToAdd;
252- sharedMins[globalIdx].x -= constantToAdd;
253- sharedMins[globalIdx].y -= constantToAdd;
250+ maxValueX += constantToAdd;
251+ maxValueY += constantToAdd;
252+ minValueX -= constantToAdd;
253+ minValueY -= constantToAdd;
254254 }
255+
256+ sharedMins[globalIdx].x = minValueX;
257+ sharedMins[globalIdx].y = minValueY;
258+ sharedMaxs[globalIdx].x = maxValueX;
259+ sharedMaxs[globalIdx].y = maxValueY;
255260 }
256261 __syncthreads();
257262
258263 // Fill each (x,y) target pixel
259264 if (x < targetWidth && y < targetHeight)
260265 {
261266 const auto baseIndex = 3*(y * targetWidth + x);
262- auto& b = targetPtr[baseIndex];
263- auto& g = targetPtr[baseIndex+1];
264- auto& r = targetPtr[baseIndex+2];
267+ float b = targetPtr[baseIndex];
268+ float g = targetPtr[baseIndex+1];
269+ float r = targetPtr[baseIndex+2];
265270 if (!blendOriginalFrame)
266271 {
267272 b = 0.f;
@@ -372,6 +377,9 @@ namespace op
372377 }
373378 }
374379 }
380+ targetPtr[baseIndex] = b;
381+ targetPtr[baseIndex+1] = g;
382+ targetPtr[baseIndex+2] = r;
375383 }
376384 }
377385}
0 commit comments