java - Pourquoi traiter un tableau trié plus rapidement que le traitement d'un tableau non trié

Mots clés : javac++performancecpu-architecturebranch-predictionjava

meilleur 4 Réponses java - Pourquoi traiter un tableau trié plus rapidement que le traitement d'un tableau non trié

vote vote

100

if (data[c] >= 128)     sum += data[c]; 
T = branch taken N = branch not taken  data[] = 0, 1, 2, 3, 4, ... 126, 127, 128, 129, 130, ... 250, 251, 252, ... branch = N  N  N  N  N  ...   N    N    T    T    T  ...   T    T    T  ...         = NNNNNNNNNNNN ... NNNNNNNTTTTTTTTT ... TTTTTTTTTT  (easy to predict) 
data[] = 226, 185, 125, 158, 198, 144, 217, 79, 202, 118,  14, 150, 177, 182, ... branch =   T,   T,   N,   T,   T,   T,   T,  N,   T,   N,   N,   T,   T,   T  ...         = TTNTTTTNTNNTTT ...   (completely random - impossible to predict) 
if (data[c] >= 128)     sum += data[c]; 
int t = (data[c] - 128) >> 31; sum += ~t & data[c]; 
vote vote

84

if (data[c] >= 128)     sum += data[c]; 
sum += data[c] >=128 ? data[c] : 0; 
int max1(int a, int b) {     if (a > b)         return a;     else         return b; } 
int max2(int a, int b) {     return a > b ? a : b; } 
:max1     movl    %edi, -4(%rbp)     movl    %esi, -8(%rbp)     movl    -4(%rbp), %eax     cmpl    -8(%rbp), %eax     jle     .L2     movl    -4(%rbp), %eax     movl    %eax, -12(%rbp)     jmp     .L4 .L2:     movl    -8(%rbp), %eax     movl    %eax, -12(%rbp) .L4:     movl    -12(%rbp), %eax     leave     ret  :max2     movl    %edi, -4(%rbp)     movl    %esi, -8(%rbp)     movl    -4(%rbp), %eax     cmpl    %eax, -8(%rbp)     cmovge  -8(%rbp), %eax     leave     ret 
vote vote

80

for (unsigned i = 0; i < 100000; ++i) {     for (unsigned j = 0; j < arraySize; ++j)     {         if (data[j] >= 128)             sum += data[j];     } } 
for (unsigned j = 0; j < arraySize; ++j) {     for (unsigned i = 0; i < 100000; ++i)     {         if (data[j] >= 128)             sum += data[j];     } } 
for (unsigned j = 0; j < arraySize; ++j) {     if (data[j] >= 128)     {         for (unsigned i = 0; i < 100000; ++i)         {             sum += data[j];         }     } } 
for (unsigned j = 0; j < arraySize; ++j) {     if (data[j] >= 128)     {         sum += data[j] * 100000;     } } 
vote vote

64

==32551== Branches:        656,645,130  (  656,609,208 cond +    35,922 ind) ==32551== Mispredicts:         169,556  (      169,095 cond +       461 ind) ==32551== Mispred rate:            0.0% (          0.0%     +       1.2%   ) 
==32555== Branches:        655,996,082  (  655,960,160 cond +  35,922 ind) ==32555== Mispredicts:     164,073,152  (  164,072,692 cond +     460 ind) ==32555== Mispred rate:           25.0% (         25.0%     +     1.2%   ) 
          Bc    Bcm Bi Bim       10,001      4  0   0      for (unsigned i = 0; i < 10000; ++i)            .      .  .   .      {            .      .  .   .          // primary loop  327,690,000 10,016  0   0          for (unsigned c = 0; c < arraySize; ++c)            .      .  .   .          {  327,680,000 10,006  0   0              if (data[c] >= 128)            0      0  0   0                  sum += data[c];            .      .  .   .          }            .      .  .   .      } 
          Bc         Bcm Bi Bim       10,001           4  0   0      for (unsigned i = 0; i < 10000; ++i)            .           .  .   .      {            .           .  .   .          // primary loop  327,690,000      10,038  0   0          for (unsigned c = 0; c < arraySize; ++c)            .           .  .   .          {  327,680,000 164,050,007  0   0              if (data[c] >= 128)            0           0  0   0                  sum += data[c];            .           .  .   .          }            .           .  .   .      } 
perf stat ./sumtest_sorted 
 Performance counter stats for './sumtest_sorted':    11808.095776 task-clock                #    0.998 CPUs utilized                    1,062 context-switches          #    0.090 K/sec                               14 CPU-migrations            #    0.001 K/sec                              337 page-faults               #    0.029 K/sec                   26,487,882,764 cycles                    #    2.243 GHz                     41,025,654,322 instructions              #    1.55  insns per cycle          6,558,871,379 branches                  #  555.455 M/sec                          567,204 branch-misses             #    0.01% of all branches            11.827228330 seconds time elapsed 
 Performance counter stats for './sumtest_unsorted':    28877.954344 task-clock                #    0.998 CPUs utilized                    2,584 context-switches          #    0.089 K/sec                               18 CPU-migrations            #    0.001 K/sec                              335 page-faults               #    0.012 K/sec                   65,076,127,595 cycles                    #    2.253 GHz                     41,032,528,741 instructions              #    0.63  insns per cycle          6,560,579,013 branches                  #  227.183 M/sec                    1,646,394,749 branch-misses             #   25.10% of all branches            28.935500947 seconds time elapsed 
perf record -e branch-misses ./sumtest_unsorted perf annotate -d sumtest_unsorted 
 Percent |      Source code & Disassembly of sumtest_unsorted ------------------------------------------------ ...          :                      sum += data[c];     0.00 :        400a1a:       mov    -0x14(%rbp),%eax    39.97 :        400a1d:       mov    %eax,%eax     5.31 :        400a1f:       mov    -0x20040(%rbp,%rax,4),%eax     4.60 :        400a26:       cltq        0.00 :        400a28:       add    %rax,-0x30(%rbp) ... 

Questions similaires