最快的素數篩選法

 http://bailuzhou.blog.163.com/blog/static/536613592007101213946727/

 
題目有點大, 主要爲了便於檢索到。小素數篩選法通常採用 Eratosthenes 方法, 複雜度
 
爲 n * lg(lg n), 不同實現性能往往差別很大。看過網上不少人寫的篩選法, 大多比較初級。
 
本文採用分段篩選, 多線程, 彙編, 位壓縮等優化手段。在雙核 CPU 2.0G 上 2 秒算出 2^31
 
 以內素數個數, 目前還沒有實現存儲, 下個版本實現之。
 
/******************************************************************
Copyright (C) 2007 Huang Yuanbing
version 1.3, 2007 PrimeNumber
mailto: bailuzhou.dot.163.com
free use for non-commercial purposes
******************************************************************
 
description:
input two numbers beg, end (0 < beg <= end < 2^32)
calculate the number of primes in the interval [beg, end]
for example:
PI[1, 2] = 1, only 2 is prime in this subinterval
PI[1, 2^31] = 105097565, there are 105097565 primes in this interval
 
basic idea:
there are two algorithms in this file. as for the first
algorithm (ALGORITHM = 1), I do not want to describe the idea here;
even though this algorithm is about (30 - 50)% faster than the second one for
larger input intervals, it has very bad performance in many cases
and also cannot save each prime if you want them.
 
as for the second one (ALGORITHM = 2):
this is an improved segmented sieve of Eratosthenes,
using multiple threads, asm and bit compression to optimize the classical algorithm.
to save memory and for performance, primes are not saved, but it's easy to do that.
for a given subinterval [beg, end] and pn running threads, the i-th thread
calculates the subinterval [beg + (end - beg) / pn * (i - 1), beg + (end - beg) / pn * i)
(pay attention to the last thread), and the main thread adds up the results it gets from
the other threads.
 
the time complexity of function PI(0, n) is n * ln(ln n) and the space complexity is sqrt(n).
this algorithm can also be implemented with MPI, OpenMP and multiple processes (IPC)
(just finished some parts).
 
this source file can be compiled by gcc and vc++
and runs on windows, unix and linux (without macro ASM).
for some older cpus the macro ASM can induce bad performance.
there may be some bugs, for it has not been fully tested.
at present the fastest prime algorithm as far as i know is ecprime; you can get it from the website:
http://www.primzahlen.de/files/referent/kw/ecprime.source.zip
******************************************************************
******************************************************************/
 
# include <stdio.h>
# include <stdlib.h>
# include <time.h>
# include <math.h>
# include <memory.h>
# include <assert.h>
 
#define ALGORITHM 2
// selects the implementation compiled below: 1 = first algorithm, anything else = segmented sieve
#define MULTI_THREAD
// multi-core CPU optimization: when defined, main() splits the range across threads via multiThread()
#define ASM0
// assembly optimization switch; the code tests `#ifdef ASM`, so with this spelling (ASM0) the asm paths are disabled
 
// COMP: shift that maps a number to its byte in a bit table (index = n >> COMP).
// MASK: keeps a bit position inside one byte (0..7).
// MAXP: capacity of the Prime[] / mprimes[] / ans[] tables.
#define COMP 4
#define MASK 7
#define MAXP 6600
// MASKN(n): single-bit mask selecting n's bit inside its byte.
//   COMP == 3: every number has a bit   -> bit index  n       & 7
//   otherwise: only odd numbers have one -> bit index (n >> 1) & 7
// The argument is parenthesized so that expressions such as MASKN(a | b)
// expand with the intended precedence.
#if (COMP == 3)
 #define MASKN(n) (1 << ((n) & MASK))
#else
 #define MASKN(n) (1 << (((n) >> 1) & MASK))
#endif
 
#define THREAD_NUM 4
// default number of worker threads used by main() when no count is given on the command line
 
#define DEFAULT_N ((1u << 31) - 1)
// upper end of the default input range [0, DEFAULT_N]; PI() of the segmented sieve rejects end > DEFAULT_N
 
typedef unsigned int uint;
// 256-entry per-byte lookup table: ALGORITHM 1's main() fills it with the number of set bits,
// initBits() (segmented sieve) fills it with the number of clear bits (8 - set bits)
unsigned char bits[1 << 8];
// prime table filled by sieve(): Prime[0] = count, Prime[1] = 2, Prime[2] = 3, ...,
// followed by one sentinel entry (Prime[count + 1] = maxn)
   int Prime[MAXP]; // 0 - 2^16
 
// sieve(maxn): fills Prime[] and BaseTpl[]; returns the number of primes stored
int sieve(int);
// PI(beg, end): number of primes counted for the interval [beg, end]
int PI(int, int);
 
#ifdef MULTI_THREAD
 
// Per-thread work item: the thread entry point calls PI(beg, end) and stores the result in pnums.
// Threadparam[i] is the slot handed to the i-th thread by multiThread().
struct ThreadInfo
{
 int beg, end; // interval passed to PI()
 int pnums;    // result of PI(beg, end), written by the worker thread
}Threadparam[THREAD_NUM * 2 + 2];
 
#ifdef _WIN32
 # include <windows.h>
DWORD WINAPI Win32ThreadFun(LPVOID pinfo)
#else
 # include <pthread.h>
void* POSIXThreadFun(void *pinfo)
#endif
{
 ThreadInfo *pThreadInfo = (ThreadInfo *) (pinfo);
 pThreadInfo->pnums = PI(pThreadInfo->beg, pThreadInfo->end);
 printf("PI[%10d, %10d] = %d\n", pThreadInfo->beg, pThreadInfo->end, pThreadInfo->pnums);
 return 0;
}
 
int init_pm(int tpnums, uint maxn, int blocksize)
{
 Threadparam[0].end = (maxn / tpnums) - (maxn / tpnums) % blocksize
  + (tpnums - 1) * blocksize * tpnums / 2;
 int bsize = Threadparam[0].end;
 for (int i = 1; i < tpnums; i++){
  Threadparam[i].beg = Threadparam[i - 1].end;
  Threadparam[i].end = Threadparam[i].beg + (bsize -= tpnums * blocksize);
 }
 Threadparam[0].beg = 2;
 Threadparam[tpnums - 1].end = maxn;
 return 0;
}
 
int multiThread(int theadnums, uint maxn)
{
 int i, pnums = 0;
#if ALGORITHM == 1
 for (i = 1; i < theadnums; i++)
  Threadparam[i].beg = Threadparam[i - 1].end = (maxn / theadnums) * i;
 Threadparam[0].beg = 2;
 Threadparam[theadnums - 1].end = maxn;
#else
 init_pm(theadnums, maxn, 255255 << 2);
#endif
 
#ifdef _WIN32
 HANDLE tHand[THREAD_NUM * 2];
 DWORD threadID[THREAD_NUM * 2];
 for (i = 0; i < theadnums; i++){
  tHand[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)Win32ThreadFun, (LPVOID)(&Threadparam[i]), 0, &threadID[i]);
  if (tHand[i] == NULL)
   printf("create Win32 thread error\n");
 }
 WaitForMultipleObjects(theadnums, tHand, true, INFINITE);
 for (i = 0; i < theadnums; i++){
  pnums += Threadparam[i].pnums;
  CloseHandle(tHand[i]);
 }
#else
 pthread_t tid[THREAD_NUM * 2];
 for (i = 0; i < theadnums; i++){
  int error = pthread_create(&tid[i], NULL, POSIXThreadFun, &Threadparam[i]);
  if ( error != 0 )
   printf("Create pthread error: %d\n", error);
 }
 for (i = 0; i < theadnums; i++){
  pthread_join(tid[i], NULL);
  pnums += Threadparam[i].pnums;
 }
#endif
 return pnums;
}
#endif
 
#ifdef ASM
// Inline-assembly version of the marking loop used by the sieves:
// sets, in mask, the bit belonging to next, next + p, next + 2p, ... while the
// value is below len. Only compiled when ASM is defined.
//   mask - bit table to update
//   next - first value to mark
//   len  - exclusive upper bound (compared with a signed jl)
//   p    - stride between marked values
// The bound is tested after a bit has been set, so the first value is marked
// unconditionally; the visible callers check next < len before calling.
// Bit addressing, per build configuration:
//   ALGORITHM != 1 && COMP == 4 : byte = value >> COMP, bit = (value >> 1) & MASK
//   ALGORITHM == 1              : byte = value >> 3,    bit = value & MASK
//   otherwise                   : bts with the value itself as bit index
// The MSVC-syntax block is used when _CONSOLE is defined, the GCC extended-asm
// block otherwise.
void asmSetBits(unsigned char mask[], int next, int len, int p)
{
#ifdef _CONSOLE
 __asm
 {
#if (COMP == 4 || ALGORITHM == 1)
  //    mov esi, len
  mov edx, p
  mov esi, mask
  mov eax, next
$loop1:
  mov edi, eax
  mov ecx, eax
#if (ALGORITHM != 1 && COMP == 4)
  shr edi, COMP
  shr ecx, 1
#else
  shr edi, 3
#endif
  mov ebx, 1
  and ecx, MASK
  shl ebx, cl
  or byte ptr [esi + edi], bl
  add eax, edx
  cmp eax, len
  jl $loop1
#else
  mov ebx, [mask]
  mov esi, len
  mov eax, next
  mov edi, p
$loop2:
  bts [ebx], eax
  add eax, edi
  cmp eax, esi
  jl $loop2
#endif
 }
#else
 // Operand numbering below: %0 = p (output), %1 = mask, %2 = next, %3 = len, %4 = p.
 // NOTE(review): both variants write %esi, which is not named in the clobber list - confirm.
 __asm
 (
#if (COMP == 4 || ALGORITHM == 1)
   "movl %1, %%esi\n"
   "movl %4, %%edx\n"
   "movl %2, %%eax\n"
"Loop1:\n"
   "movl %%eax, %%edi\n"
   "movl %%eax, %%ecx\n"
#if (ALGORITHM != 1 && COMP == 4)
   "shrl $4, %%edi\n"
   "shrb $1, %%cl\n"
#else
   "shrl $3, %%edi\n"
#endif
   "movb $1, %%bl\n"
   "andb $7, %%cl\n"
   "shlb %%cl, %%bl\n"
   "orb  %%bl, (%%esi, %%edi)\n"
   "addl %%edx, %%eax\n"
   "cmpl %3, %%eax\n"
   "jl Loop1\n"
#else
   "leal (%1), %%eax\n"
   "movl %3,  %%esi\n"
   "movl %2, %%ebx\n"
   "movl %4, %%edi\n"
"loop2:\n"
   "btsl %%ebx, (%%eax)\n"
   "addl %%edi, %%ebx\n"
   "cmpl %%esi, %%ebx\n"
   "jl loop2\n"
#endif
   : "=m" (p)
   : "r" (mask), "g" (next), "g" (len),"g" (p)
   : "ax", "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi"
 );
#endif
 //    return;
}
 
// Inline-assembly sum of bits[mask[k]] over a run of bytes. Only compiled when
// ASM is defined.
//   mask - byte array whose entries index the global bits[] table
//   len  - loop counter start value
// Returns the accumulated sum.
// The counter starts at len and is decremented by the x86 `loop` instruction
// after each read, so the bytes read are mask[len] down to mask[1];
// mask[0] is not included and mask[len] is.
// NOTE(review): the plain C loop in ALGORITHM 1's PI() covers mask[1..size-1],
// i.e. it does not read mask[size] - confirm which range is intended.
// A len of 0 is not handled specially - presumably callers never pass it.
int asmSum(const unsigned char mask[], int len)
{
 int pnums = 0;
#ifdef _CONSOLE
 __asm
 {
  mov esi, mask
  mov ecx, len
  xor edx, edx
  xor eax, eax
  xor ebx, ebx
$loopsum:
  mov al, byte ptr [esi + ecx]
  mov bl, byte ptr bits[eax]
  add edx, ebx
  loop $loopsum
  mov pnums, edx
 }
#else
 // Operand numbering: %0 = pnums (output), %1 = len, %2 = mask, %3 = bits.
 // NOTE(review): %esi is written but not named in the clobber list - confirm.
 __asm
 (
   "movl %1, %%ecx\n"
   "xorl %%edx, %%edx\n"
   "movl %2, %%edi\n"
   "movl %3, %%esi\n"
   "xorl %%eax, %%eax\n"
   "xorl %%ebx, %%ebx\n"
"loopsum:\n"
   "movb (%%edi, %%ecx), %%al\n"
   "movb (%%esi, %%eax), %%bl\n"
   "addl %%ebx, %%edx\n"
   "loop loopsum\n"
   "movl %%edx, %0\n"
   : "=m" (pnums)
   : "g" (len), "g" (mask), "g"(bits)
   : "ax", "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi"
 );
#endif
 return pnums;
}
#endif
 
/************************************************************************/
/*                          ALGORITHM1                                  */
/************************************************************************/
#if (ALGORITHM == 1)
 
// SP: index into Prime[]; sieve() marks the multiples of Prime[2..SP] in BaseTpl,
// and the loops of this algorithm start at Prime[SP].
#define SP 7
// 30030 = 2 * 3 * 5 * 7 * 11 * 13
#define BLOCKSIZE 30030
 
// bit table filled by sieve(); init() tests it to reject candidates
unsigned char BaseTpl[(1 << 16) >> COMP];
// mprimes[i] is computed in main() from solve(Prime[i], -BLOCKSIZE, 1, ...) and
// reduced to the range [0, Prime[i]); init() uses it as the per-prime multiplier
int mprimes[MAXP];
 
// Recursive extended-Euclid style solver: on return x and y satisfy
//   a * x + b * y == c
// provided the final division c / b in the base case is exact.
//   a, b, c - coefficients and right-hand side
//   x, y    - outputs (overwritten)
void solve(int a, int b, int c, int &x, int &y)
{
 if (a != 0){
  // solve the reduced problem (b % a) * y + a * x == c, then fold the quotient back into x
  solve(b % a, a, c, y, x);
  const int quotient = b / a;
  x -= quotient * y;
  return;
 }

 // base case: 0 * x + b * y == c
 x = 0;
 y = c / b;
}
 
// Per-candidate setup for ALGORITHM 1.
//   ans  - per-prime residue table, updated in place for k = SP .. Prime[0]
//   beg  - current candidate value
//   last - candidate value for which ans[] was last updated
// Returns 1 (caller skips beg, ans[] untouched) when beg is even or its bit is
// set in BaseTpl, i.e. beg is an odd multiple of one of Prime[2..SP] as marked
// by sieve(). Otherwise advances every ans[k] by mprimes[k] * (beg - last)
// modulo Prime[k] and returns 0.
int init(int ans[], int beg, int last)
{
#if 1
 if ((beg & 1) == 0 || BaseTpl[beg >> COMP] & MASKN(beg))
  return 1;
#else
 // disabled alternative: trial division by the small primes
 for (int i = 1; i < SP; i++){
  if (beg % Prime[i] == 0)
   return 1;
 }
#endif
 
#if 1
 for (int k = SP, dis = beg - last; k <= Prime[0]; ++k)
  ans[k] = (ans[k] + mprimes[k] * dis) % Prime[k];
#else
 // disabled MSVC inline-asm version of the loop above
 __asm
 {
  mov ebx, SP
  mov ecx, beg
  sub ecx, last
  mov esi, ans
  mov edi, Prime[0]
  shl edi, 2
$LOOP:
  mov eax, [mprimes + ebx]
  imul ecx
  add eax, [esi + ebx]
  idiv [Prime  + ebx]
  mov [esi + ebx], edx
  add ebx, 4
  cmp ebx, edi
  jle $LOOP
 }
#endif
 return 0;
}
 
// ALGORITHM 1 counting routine (the file header deliberately leaves the method
// undescribed; the notes below only restate what the code does).
// For every beg in [beg, end) that init() accepts, a fresh bit mask of
// len = (DEFAULT_N - beg) / BLOCKSIZE positions is built: for each prime
// Prime[k], k = SP .. Prime[0], positions next, next + p, ... below len are
// set, where next is derived from the residue ans[k]. The per-candidate
// contribution is len minus the set bits of mask[1 .. size-1], and the set
// bits of mask[0] are subtracted from the running total.
// NOTE(review): presumably each mask position m stands for the number
// beg + m * BLOCKSIZE and the result counts the unmarked ones - confirm.
// Returns the accumulated count; main() adds Prime[0] to it when printing.
int PI(int beg, int end)
{
 int ans[MAXP] = {0}; 
 int pnums = 0, last = 0;
 for (; beg < end; beg++){
  if (init (ans, beg, last))
   continue;
  unsigned char mask[MAXP * 2] = {0};
  // also records beg as the value ans[] now corresponds to
  uint len = (DEFAULT_N - (last = beg)) / BLOCKSIZE;
  for (int k = SP; k <= Prime[0]; ++k){
   const uint p = Prime[k];
   uint next = (ans[k] == 0) ? p - 1 : ans[k] - 1;
#ifndef ASM
   for (; next < len; next += p)
    mask[next >> 3] |= 1 << (next & MASK);
#else
   if (next < len)
    asmSetBits(mask, next, len, p);
#endif
  }
  // number of mask bytes needed for the largest possible len
  int size = (DEFAULT_N / BLOCKSIZE + 7) >> 3;
  pnums -= bits[mask[0]];
#ifndef ASM
  for (int kk = 1; kk < size; ++kk)
   len -= bits[mask[kk]];
  pnums += len;
#else
  pnums += len - asmSum(mask, size);
#endif
 }
 return pnums;
}
 
int main(int arg, char **argc)
{
 clock_t tstart = clock();
 
 int i, pnums;
 sieve(DEFAULT_N);
 //init bits
 for (i = 1; i < (int)(sizeof(bits) / sizeof(bits[0])); ++i)
  bits[i] = bits[i >> 1] + (i & 1);
 
 for (i = SP; i <= Prime[0]; ++i){
  solve(Prime[i], -BLOCKSIZE, 1, mprimes[0],  mprimes[i]);
  mprimes[i] = (mprimes[i] % (int)Prime[i] + Prime[i]) % Prime[i];
 }
#ifdef MULTI_THREAD
 if (arg > 1)
  pnums = multiThread(atoi(argc[1]), BLOCKSIZE);
 else
  pnums = multiThread(THREAD_NUM, BLOCKSIZE);
#else
 pnums = PI(0, BLOCKSIZE);
#endif
 printf("PI1[%u] : primes = %d, time use %ld ms\n", (uint)DEFAULT_N, (pnums + Prime[0]), clock() - tstart);
 return 0;
}
 
#else
 
#ifdef S6
 #define FACT 15015 * 16 // 3 * 5 * 7 * 11 * 13 = 15015
 #define SP 6
#else
 #define FACT 255255     // 3 * 5 * 7 * 11 * 13 * 17 = 255255
 #define SP 7
#endif
 
#define BLOCKSIZE (FACT << 2)
#define MAXM ((BLOCKSIZE >> COMP) + 2)
unsigned char BaseTpl[MAXM]; //the table the fist SP primes is removed
unsigned char bitsIndex[1 << 8][5] = {0};
 
/******************************
// 1  3  5  7  9 11 13 15
// 0  1  1  1  1  1  1  1  6    0xfe  mask[0] 
//17 19 21 23 25 27 29 31
// 1  1  0  1  0  0  1  1  5    0xca  mask[1]
//33 35 37 39 41 43 45 47
// 0  0  1  0  1  1  0  1  4    0xb4  mask[2]
//49 51 53 55 57 59 61 63 
// 0  0  1  0  0  1  1  0  3    0x64  mask[3]
******************************/
 
//#include <assert.h>
int outPrint(unsigned char mask[], int start, uint len)
{
 for (uint i = 0; i < len; i += 1 << COMP){
  int bi = i >> COMP;
#if 0
  assert(bits[mask[bi]] <= 5);
  for (uint j = i + 1; j < i + 16; j += 2){
   if ( !(mask[bi] & MASKN(j)) )
    printf("%d ", start + (bi << COMP) + 2 * ((j >> 1) & 7) + 1);
  }
#else
  if (bitsIndex[mask[bi]][0] == 0)
   continue;
  for (uint j = mask[bi], k = 0; bitsIndex[j][k]; k++)
   printf("%d ", start + (bi << COMP) + 2 * bitsIndex[j][k] - 1);
#endif
  putchar('\n');
 }
 return 0;
}
 
int piRange(int start, uint len = BLOCKSIZE)
{
 int srid = start % BLOCKSIZE;
 len += srid;
 
 uint next, pnums = 0;
 bool ok = start >= BLOCKSIZE;
 const int maxp = (int)sqrt((float)(start + len)) + 1;
 unsigned char mask[MAXM + 4];
 memcpy(mask, BaseTpl, (len >> COMP) + 1);
 mask[len >> COMP] |= ~(MASKN(len) - 1);
 if (srid)
  mask[srid >> COMP] |= (MASKN(srid) - 1);
 
 for (int i = SP + 1, beg = srid - 1, p = Prime[i]; p < maxp; p = Prime[++i]){
  if (ok){
   next = beg + p - (start - 1) % p;
   if ((next & 1) == 0)
    next += p;
  }else
   next = p * p;
  p <<= 1;
#ifdef ASM
  if (next < len)
   asmSetBits(mask , next, len , p);
#else
  for (; next < len; next += p)
   mask[next >> COMP] |= MASKN(next);
#endif
 }
 int size = len >> COMP;
 for (int k = (srid >> COMP); k <= size; k++)
  pnums += bits[mask[k]];
 return pnums;
}
 
 
int P7(int beg, int end)
{
 int pnums = 0;
#if 0
 for (int j = SP; beg <= Prime[j] && j > 0; j--){
  if (end >= Prime[j])
   pnums++;
 }
#else
//            2  3     5     7          11    13           17   19
 static int Prime19[20] = {0, 0, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8};
 if (end > Prime[SP])
  pnums += Prime19[Prime[SP]] - Prime19[beg - 1];
 else
  pnums += Prime19[end] - Prime19[beg - 1];
#endif
 return pnums;
}
 
int PI(int beg, int end)
{
 int pnums = 0;
 if (beg > end || end > DEFAULT_N)
  return -1;
 if (beg < 2)
  beg = 2;
 if (beg <= Prime[SP])
  pnums = P7(beg, end);
 if (beg / BLOCKSIZE == end / BLOCKSIZE){
  pnums += piRange(beg, end - beg + 1);
  return pnums;
 }
 int size = end % BLOCKSIZE;
 if (size != 0)
  pnums += piRange(end - size, size + 1);
 size = beg % BLOCKSIZE;
 if (size != 0){
  pnums += piRange(beg, BLOCKSIZE - size);
  beg += BLOCKSIZE - size;
 }
 beg /= BLOCKSIZE, end /= BLOCKSIZE;
 for (int i = beg; i < end; i++){
  static int pcache[DEFAULT_N / BLOCKSIZE + 2] = {0};
  if (pcache[i] == 0)
   pcache[i] = piRange(i * BLOCKSIZE);
  pnums += pcache[i];
 }
 return pnums;
}
 
void initBits()
{
 int i;
 int bitsize = sizeof(bits) / sizeof(bits[0]);
 for (i = 1; i < bitsize; i++)
  bits[i] = bits[i >> 1] + (i & 1);
 for (i = 0; i < bitsize; i++)
  bits[i] = 8 - bits[i];
 
 for (i = 0; i < bitsize; i++){
  int num = i, cnt = 0;
  if (bits[num] > 5)
   continue;
  for (int j = 0; num; j++){
   if ( !(num & 1) )
    bitsIndex[i][cnt++] = j + 1;
   num >>= 1;
  }
//  printf("i = %d, cnt = %d\n", i, cnt);
 }
}
 
#include <windows.h>
 
int main(int arg, char **argc)
{
 clock_t tstart = clock();
 int pnums = 0;
 initBits();
 sieve(DEFAULT_N);
 
#ifdef MULTI_THREAD
    if (DEFAULT_N > THREAD_NUM * BLOCKSIZE){
  if (arg > 1)
   pnums = multiThread(atoi(argc[1]), DEFAULT_N);
  else
   pnums = multiThread(THREAD_NUM, DEFAULT_N);
 }else
  pnums = PI(0, DEFAULT_N);
#else
 pnums = PI(0, DEFAULT_N);
#endif
 printf("PI2[0 - %u] : primes = %d, time use %ld ms\n", (uint)DEFAULT_N, pnums, clock() - tstart);
 
 //for test the result
 int beg, end;
 while (scanf ("%d %d", &beg, &end) == 2 && beg <= end){
  tstart = clock();
  printf("PI[%d, %d] = ", beg, end);
  pnums = PI(beg, end);
  printf("%d, time use %d ms\n", pnums, clock() - tstart);
 
 }
 return 0;
}
#endif
 
int sieve(int maxn)
{
 uint p, primes = 1;
 uint maxp = (uint)sqrt((float)maxn) + 19;
 
#if (ALGORITHM == 1)
 if (maxp < BLOCKSIZE)
  maxp = BLOCKSIZE + 10;
#endif
 Prime[1] = 2;
 for (p = 3; p < maxp; p += 2){
  if ( !(BaseTpl[p >> COMP] & MASKN(p)) ){
   Prime[++primes] = p;
   for (uint j = p * p; j < maxp; j += p << 1)
    BaseTpl[j >> COMP] |= MASKN(j);
  }
 }
#if (COMP == 4)
 memset(BaseTpl, 0, sizeof(BaseTpl));
#else
 memset(BaseTpl, 0x55, sizeof(BaseTpl));
#endif
 for (int i = 2; i <= SP; i++){
//  printf("%d\n", Prime[i]);
  for (int j = Prime[i], p = j; j < BLOCKSIZE; j += p << 1)
   BaseTpl[j >> COMP] |= MASKN(j);//printf("j = %d\n", j);
 }
 Prime[primes + 1] = maxn;
 Prime[0] = primes;
 return primes;
}

源地址: http://www.mwtee.com/blog-18294-2560.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章