The assumption by other answers that one needs to sort the data is not correct.

The following code does not sort the entire array, but only 200-element segments of it, and thereby runs the fastest.

Sorting only k-element segments completes the pre-processing in linear time, O(n), rather than the O(n log n) time needed to sort the entire array. Within each sorted segment the branch then fails for one long run (values below 128) and succeeds for another (values 128 and above), so the predictor mispredicts only about once per segment.
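For segment size k (a constant, here k = 200) and array length n, the pre-processing cost works out as:

\frac{n}{k} \cdot O(k \log k) = O(n \log k) = O(n) \quad \text{for any fixed } k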

#include <algorithm>
#include <cstdlib>   // std::rand
#include <ctime>     // std::clock, CLOCKS_PER_SEC
#include <iostream>

int main() {
    int data[32768]; const unsigned l = sizeof data / sizeof data[0];

    // fill with random values in [0, 255]
    for (unsigned c = 0; c < l; ++c)
        data[c] = std::rand() % 256;

    // sort 200-element segments, not the whole array
    for (unsigned c = 0; c + 200 <= l; c += 200)
        std::sort(&data[c], &data[c + 200]);

    std::clock_t start = std::clock();
    long long sum = 0;

    // timed loop: this branch is what the predictor has to learn
    for (unsigned i = 0; i < 100000; ++i) {
        for (unsigned c = 0; c < l; ++c) {
            if (data[c] >= 128)
                sum += data[c];
        }
    }

    std::cout << static_cast<double>(std::clock() - start) / CLOCKS_PER_SEC << std::endl;
    std::cout << "sum = " << sum << std::endl;
}

This also "proves" that it has nothing to do with any algorithmic issue such as sort order, and it is indeed branch prediction.
