/* emc.c -- Integer factorization using the Elliptic Curve Method
   See http://www.loria.fr/~zimmerma/records/ecmnet.html

  Copyright (C) 1998 Paul Zimmermann, INRIA Lorraine, zimmerma@loria.fr
  See http://www.loria.fr/~zimmerma/records/ecmnet.html

  This program is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2 of the License, or (at your
  option) any later version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; see the file COPYING.  If not, write to the Free
  Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.

Changes with respect to 2a:
- use base-2 division for factors of 2^k+/-1
- added division using pre-inversion (macro PREINVERT): no saving
- now prints a warning for probable prime input
- now checks if factors found are composite or not
- now checks for prime powers
Changes with respect to 2b:
- saved a factor of two in step 2 initialization, and a factor of two
  in memory needed by step 2
- changed B2 and m in step 2 to be double's --> no overflow any more,
  even on 32-bit machines.
- fixed bug for multiple-line input (thanks to Torbjorn and P. Leyland)
Changes with respect to 2c:
- added LARGE macro for large input, like Fermat numbers, to disable
  printing of input number, values of A and x
- now does primality and perfect-power tests only with CHECK=1 (default 0)
Changes with respect to 2d:
- no normalization by default in step 1
- no gcd after each prime in step 1 (only one at the end of step 1),
   unless GROUPORDER is defined (useful for computing group order)
- no gcd after each giant-step in phase 2 (only one at the end), unless
   GROUPORDER is defined
- in step 2, if NOGCD is defined, does no gcd (3 times more multiplications)
  ==> useful for Fermat numbers
- included changes from Alex Stuebinger for ANSIONLY compilers (08/04/98)
- LARGE=1 by default (does not print A=..., x=...)
- removed compare2 (always use base-2 division for divisors of 2^n+/-1)
Changes with respect to 2f:
- Change several variables from type double to just int.
- Change sieve table (pr) to have values 0 and 1, not '0' and '1'.
- Change pr to unsigned char (faster on alpha, sparc, etc).
- Avoid repeated %q in step2 (as well as step1).
- Sieve only on odd i's in pr[i] (step 1) and even i's in step 2
- Avoid expensive %q computations in step 2 (now only at initialization)
Changes with respect to 2g:
- combine k prime powers in step 1 and replace k muls by one inverse
To do:
- use Peter's trick to replace k gcdexts by one gcdext and (k-1) muls
  (both in step 1 and 2)
Changes with respect to 2h:
- removed PREINVERT stuff
- new add3 procedure: removed all mpz_set in multiply
- implemented Peter Montgomery's PRAC algorithm [4]
Changes with respect to 2i:
- implemented Peter Montgomery's MODMULN algorithm
Changes with respect to 2ia:
- completely rewritten step 2
- removed NOGCD stuff
Changes with respect to version 3a:
- added GNU General Public License
- removed REDC stuff
Changes in version 4 (April 1999):
- uses fast multipoint polynomial evaluation in step 2,
  together with O(n^1.59) Karatsuba multiplication
- uses k passes in step 2: cost [30+12*(k-1)]*M(n/2)
Changes in version 4a:
- cost of recip reduced to 9/2*M(n/2) instead of 6*M(n/2)
  i.e. cost of step 2 is now [57/2+12*(k-1)]*M(n/2)
- improved help

This version uses Montgomery's form (8) from [2] which avoids gcds:

        b*y^2*z = x^3 + a*x^2*z + x*z^2

References:
[1] "Speeding the Pollard and Elliptic Curve Methods of Factorization", by 
   Peter Montgomery, Math. of Comp. 48 (177), pages 243-264, January 1987.
[2] "Factorization of the tenth and eleventh Fermat numbers", by Richard Brent,
ftp://nimbus.anu.edu.au/pub/Brent/rpb161tr.dvi.gz
[3] Torbjorn Granlund, Peter L. Montgomery: Division by Invariant Integers 
    using Multiplication. PLDI 1994: 61-72, SIGPLAN Notices 29(6) (June 1994)
[4] "Evaluating recurrences of form X_{m+n} = f(X_m,X_n,X_{m-n}) via Lucas
     chains", Peter Montgomery, ftp.cwi.nl:/pub/pmontgom/Lucas.ps.gz

Examples (log and timing lines omitted):

% echo 137703491 | ecm 100 6
********** Factor found in step 1: 17389

(if compiled with -DGROUPORDER)
% echo 137703491 | ecm 100 13
********** Factor found in step 2: 7919

(bug found by T. Granlund in 1st version of ecm2f)
% echo 17061648125571273329563156588435816942778260706938821014533 | ecm 174000 585928442
********** Factor found in step 2: 4562371492227327125110177

From [2], page 15 (factorization of 55^126+1):
% echo 5394204444759808120647321820789847518754252780933425517607611172590240019087317088600360602042567541009369753816111824690753627535877960715703346991252857 | ecm 345551 805816989
********** Factor found in step 1: 25233450176615986500234063824208915571213

% ecm 314263 14152267 4677853 < F10.cofactor
Input number is 607820568181834328745927047401406785398975700821911559763928675076909152806525747797078707978021962487854849079350770968904705424125269800765765006449689562590686195386366153585734177565092347016126765195631310982002631912943551551593959032889971392442015624176361633631364310142874363629569
********** Factor found in step 2: 4659775785220018543264560743076778192897

# first Cunningham factor found by GMP-ECM (06 Dec 1997)
% echo 449590253344339769860648131841615148645295989319968106906219761704350259884936939123964073775456979170209297434164627098624602597663490109944575251386017 | ecm 1000000 63844855
********** Factor found in step 2: 241421225374647262615077397

# p48 found by Richard Brent on October 9, 1997
% echo 3923385745693995079670229419275984584311007321932374190635656246740175165573932140787529348954892963218868359081838772941945556717 | ecm 141667 876329474 150814537
********** Factor found in step 2: 662926550178509475639682769961460088456141816377

# p45 found by Richard Brent on October 24, 1997
% echo 89101594496537524661600025466303491594098940711325290746374420963129505171895306244425914080753573576861992127359576789001 | ecm 325001 877655087 1032299
********** Factor found in step 2: 122213491239590733375594767461662771175707001
*/

#include <ctype.h>
#include <math.h>
#include <stdio.h>
#ifndef IRIX /* conflict with gcc math.h */
#include <stdlib.h>
#endif
#include <string.h>
#include <time.h>


#ifndef ANSIONLY
#include <unistd.h>
#include <sys/times.h>
#include <sys/time.h>
#endif

#define MODMULN

#include "gmp.h"
#ifdef MODMULN
#include "gmp-impl.h"
mp_limb_t Nprim;
#endif

#ifndef max
#define max(a,b) (((a)>(b)) ? (a) : (b))
#define min(a,b) (((a)<(b)) ? (a) : (b))
#endif

#ifdef NORANDOM
/* Replacement for random() on systems that lack it (NORANDOM builds).
   Combines two rand() results into a non-negative value of at most 31 bits.
   The combination is done in unsigned arithmetic and masked, so that no
   signed shift can overflow and the result can never be negative, whatever
   RAND_MAX is. The two rand() calls are sequenced explicitly. */
long int random(void)
{
  unsigned long low = (unsigned long) rand();
  unsigned long high = (unsigned long) rand();

  return (long int) ((low ^ (high << 16)) & 0x7fffffffUL);
}
#endif

#define LARGE 1 /* does not print A=..., x=... */
#ifdef GROUPORDER
#undef LARGE
#endif

/* ANSI Prototypes */
extern void printout(__mpz_struct *,__mpz_struct *);
extern int ecm(__mpz_struct *,__mpz_struct *,__mpz_struct *,double,int,int);
extern int step1(__mpz_struct *);
extern void initprimes(double,int );
extern void prac(unsigned int );
extern void add3(__mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *,
		 __mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *);
extern void duplicate(__mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *);
extern int step2(__mpz_struct *,__mpz_struct *,unsigned int,double,
		 __mpz_struct *,__mpz_struct *,int,unsigned int);
extern int cputime(void);
extern int isbase2(__mpz_struct *,double);
extern void mod2plus(__mpz_struct *,__mpz_struct *,__mpz_struct *);
extern void mod2minus(__mpz_struct *,__mpz_struct *,__mpz_struct *);
extern void mpz_mod_n(__mpz_struct *,__mpz_struct *,__mpz_struct *);

typedef void (*mod_t) (mpz_t, mpz_t, mpz_t);

void (*mod)(mpz_t a,mpz_t b,mpz_t n)=NULL;
extern int multiplyW(__mpz_struct *,__mpz_struct *,__mpz_struct *,
		     __mpz_struct *,__mpz_struct *,unsigned int,__mpz_struct *,
		     __mpz_struct *,__mpz_struct *,__mpz_struct *);
extern int subW(__mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *,
		__mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *,
		__mpz_struct *,__mpz_struct *);
extern int addWn(__mpz_struct *,mpz_t *,mpz_t *,
		 __mpz_struct *,mpz_t *,mpz_t *,int);
extern int duplicateW(__mpz_struct *,__mpz_struct *,__mpz_struct *,
		      __mpz_struct *,__mpz_struct *,__mpz_struct *,
		      __mpz_struct *,__mpz_struct *,__mpz_struct *);
extern int addW(__mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *,
		__mpz_struct *,__mpz_struct *,__mpz_struct *,__mpz_struct *,
		__mpz_struct *,__mpz_struct *);
extern void polyeval(mpz_t *,mpz_t **,mpz_t *,unsigned int);
extern void buildF(mpz_t **,mpz_t *,unsigned int);
extern double default_B2(double);
extern void recip(mpz_t*,mpz_t*,mpz_t*,unsigned int);
extern void polymul(mpz_t*,mpz_t*,mpz_t*,unsigned int,mpz_t*);
extern void polymul1(mpz_t*,mpz_t*,mpz_t*,unsigned int,mpz_t*);
extern void karatsuba(mpz_t*,mpz_t*,mpz_t*,unsigned int,mpz_t*);
extern void buildG(mpz_t*,mpz_t*,unsigned int);
extern void polymulmod(mpz_t*,mpz_t**,mpz_t*,int);

#ifndef ANSIONLY
#ifndef ULTRIX /* from Paul Leyland: otherwise produces a syntax error */
extern void srand48();
extern pid_t getpid();
struct tms ti;
#endif
#endif

/* global variables */
/* bb: bound of the prime table, also the width of the sieve window used by
       step1 (set by initprimes)
   prime[1..nbprimes]: the primes up to bb, filled by initprimes
   mul, gcdexts: operation counters printed after each step
   verbose, base2: switches set from the command line in main
   NOTE(review): lgn is not referenced in the part of the file reviewed
   here -- confirm its use further down. */
unsigned int bb,*prime,nbprimes,mul,gcdexts,lgn,verbose=0,base2=1;
/* number of limbs of n, set in ecm from mpz_size(n) */
mp_size_t sizen;
#ifdef COUNTADD
/* statistics: how often each PRAC condition fired, and number of duplicates */
unsigned int cond1=0,cond2=0,cond3=0,cond4=0,cond5=0,cond6=0,cond7=0,cond8=0,cond9=0,dups=0;
#endif
/* number of calls to add3 (counted unconditionally) */
unsigned int adds=0;
/* B1: step 1 bound, read from the command line.
   ispower2: result of isbase2(); ecm uses its absolute value as the exponent
   k and its sign to choose between the 2^k+1 and 2^k-1 reductions; 0 means
   the special base-2 division is not in use. */
int B1,ispower2; long lrand48();
/* sieve table: allocated by initprimes, reused as a window by step1 */
unsigned char *pr;
/* a: curve parameter, b: (a+2)/4 mod n, n: number to factor,
   u,v,w: scratch variables, (x:z): current point,
   (x1:z1)..(x4:z4): work points used by prac, one: the constant 1.
   NOTE(review): y, invn, Rmod and Rmodd are not used in the part of the
   file reviewed here -- confirm their role further down. */
mpz_t a,b,n,u,v,w,x,z,x1,z1,x2,z2,one,y,invn,*Rmod,x3,z3,x4,z4;
mp_limb_t **Rmodd;

/* Returns the length of the base-10 string that mpz_get_str produces for n. */
unsigned int nb_digits(n) mpz_t n;
{
   char *decimal = mpz_get_str(NULL,10,n); /* buffer allocated by GMP */
   unsigned int length = strlen(decimal);

   free(decimal);
   return length;
}

void clear_all()
{
   mpz_clear(a); mpz_clear(b); mpz_clear(n); mpz_clear(u); mpz_clear(v);
   mpz_clear(w); mpz_clear(x); mpz_clear(x1); mpz_clear(z1); mpz_clear(x2);
   mpz_clear(z2); mpz_clear(z); mpz_clear(one); mpz_clear(y); mpz_clear(invn);
   mpz_clear(x3); mpz_clear(z3); mpz_clear(x4); mpz_clear(z4);
}

/* Command-line driver.
   Parses the options and positional arguments listed in the usage message,
   builds the prime table for B1, seeds the random generator when no sigma is
   given, then reads numbers from standard input and runs ecm() on each,
   reporting any factor found.
   Exit status: 0 iff a factor was found for the last input number,
   1 otherwise (also after a usage error or a negative B1). */
int main(argc,argv) int argc; char *argv[];
{
   int r=0,e=0,iter=0,bad_args=0,p_is_prime;
   int c='0'; /* int, not char: must be able to hold EOF from getchar() */
   mpz_t p,s; double B2;

   if (argc>1 && strcmp(argv[1],"-v")==0) {
     verbose=1;
     argv += 1; argc -= 1;
   }
   if (argc>1 && strcmp(argv[1],"-nobase2")==0) {
     base2=0;
     argv += 1; argc -= 1;
   }
   if (argc>1 && strcmp(argv[1],"-e")==0) {
     /* the option needs a value: never read past the end of argv */
     if (argc>2) { e = atoi(argv[2]); argv += 2; argc -= 2; }
     else bad_args=1;
   }
   if (argc>1 && strcmp(argv[1],"-k")==0) {
     if (argc>2) { iter = atoi(argv[2]); argv += 2; argc -= 2; }
     else bad_args=1;
   }
   if (bad_args || argc<2) {
     printf("Usage: ecm [-v] [-nobase2] [-e n] [-k n] B1 [sigma [B2]] < file\n");
     printf("       ecm [-v] [-e n] [-k n] B1 A B2 x1 < file\n");
     printf("\n");
     printf("Parameters:\n");
     printf("  B1         step 1 limit\n");
     printf("  sigma      elliptic curve seed (0 gives random curve)\n");
     printf("  B2         step 2 limit (default is >= 100*B1)\n");
     printf("  A, x1      elliptic curve parameter and starting point\n");
     printf("  file       file of numbers to factor, one per line (or standard input)\n");
     printf("\n");
     printf("Options:\n");
     printf("  -v         verbose\n");
     printf("  -nobase2   disable special division for factors of 2^n+/-1\n");
     printf("  -e n       impose polynomial x^n for Brent-Suyama's extension\n");
     printf("  -k n       perform at most n passes in step 2\n");
     exit(1);
   }
   printf("GMP-ECM 4a, by P. Zimmermann (Inria), 20 Apr 1999, with contributions from\n");
   printf("T. Granlund, P. Leyland, C. Curry, A. Stuebinger, G. Woltman, JC. Meyrignac,\n");
   printf("and the invaluable help from P.L. Montgomery.\n");
   mpz_init(a); mpz_init(b); mpz_init(n); mpz_init(u); mpz_init(v);
   mpz_init(w); mpz_init(x); mpz_init(x1); mpz_init(z1); mpz_init(x2);
   mpz_init(z2); mpz_init(z); mpz_init(s); mpz_init(x3); mpz_init(z3);
   mpz_init(x4); mpz_init(z4);
   mpz_init_set_ui(one,1); mpz_init(p); mpz_init(y); mpz_init(invn);
   B1 = atoi(argv[1]);
   if (B1<0) { printf("Error: negative B1\n"); exit(1); }
   /* initialize table of primes */
   bb=0; initprimes((double)B1,0);
   /* second argument: sigma (or A when four arguments are given) */
   if (argc>=3) mpz_set_str(s,argv[2],10); else mpz_set_ui(s,0);
   if (argc>=4) B2 = atof(argv[3]);
   else /* Default B2 */
       B2 = default_B2((double) B1);
   if (argc>=5) {
     /* form "B1 A B2 x1": explicit curve parameter and starting point */
     mpz_set_str(a,argv[2],10); mpz_set_str(x,argv[4],10);
   }
   else mpz_set_ui(x,0);
   /* seed the random generator only when a random sigma will be needed */
#ifdef ANSIONLY
  if (mpz_cmp_ui(s,0)==0) {
    time_t timer;
    struct tm tp;
    time(&timer);
    tp = *localtime(&timer);
    srand(tp.tm_hour * 3600 + tp.tm_min * 60 + tp.tm_sec);
  }
#else
   if (mpz_cmp_ui(s,0)==0) {
#ifdef __DJGPP__
     srandom(time(NULL) + getpid()); /* thanks to Conrad Curry */
#else
     struct timeval tp;
     gettimeofday(&tp, NULL);
     srand48(65536 * tp.tv_sec + tp.tv_usec + getpid());
#endif
   }
#endif
   while (!feof(stdin)) {
     if (mpz_inp_str(n,stdin,0)==0)
       r=0; /* no number could be read: nothing to factor on this round */
     else {
#ifndef VERYLARGE
       {
         /* str stays NULL when the number is too long to be echoed, so the
            digit count and the free() below apply to an allocated string only */
         char *str = NULL;
         if (mpz_sizeinbase (n, 10) <= 1000)
           str = mpz_get_str(NULL,10,n);
         if (str != NULL) {
           printf("Input number is %s", str);
           printf(" (%u digits)\n",(unsigned int)strlen(str));
           free(str);
         }
         else
           printf("Input number is %s", "too large for my taste\n");
         fflush(stdout);
       }
#endif
       if ((r=ecm(p,n,s,B2,e,iter))) {
         if (mpz_cmp(p,n)) {
           p_is_prime = mpz_probab_prime_p(p,25);
           if (p_is_prime)
             printf("Found probable prime factor");
           else printf("Found COMPOSITE factor");
           printf(" of %u digits: ",nb_digits(p));
           mpz_out_str(stdout,10,p); putchar('\n');
           if (p_is_prime && nb_digits(p)>=47) {
             printf("Report your potential champion to Richard Brent <rpb@comlab.ox.ac.uk>\n");
             printf("(see ftp://ftp.comlab.ox.ac.uk/pub/Documents/techpapers/Richard.Brent/champs.txt)\n");
           }
           mpz_divexact(n,n,p);
           if (mpz_probab_prime_p(n,25)) printf("Probable prime");
           else printf("Composite");
           printf(" cofactor ");
           mpz_out_str(stdout,10,n);
           printf(" has %u digits",nb_digits(n));
         }
         else printf("Found input number N");
         printf("\n"); fflush(stdout);
       }
     }
     /* skip to the first digit of the next number, if any */
     while (!feof(stdin) && !isdigit(c=getchar()));
     /* exit with 0 iff a factor found for the last input. Allows to do:
	while ecm 1000000 <mycomposite; do true; done
	n=1000; while [ $n -gt 0 ]; do ecm 1000000 <mycomposite && break; n=`expr $n - 1`; done
	*/
     if (feof(stdin)) { r=(r) ? 0 : 1; goto end; }
     ungetc(c,stdin);
     mpz_set_ui(x,0); /* further numbers always use a curve derived from sigma */
   }
   r=1;
 end:
   mpz_clear(p); mpz_clear(s); free(pr); free(prime); clear_all(); return(r);
}

/* Selects the step 2 parameters for a given bound.
   The table is scanned for the first row whose limit is >= B2; that row's
   FFT size and phi value are stored through fft_size and phi, and its D is
   returned.  Each limit equals 2*D*fft_size and phi is below fft_size in
   every row.  Prints an error and exits with status 1 when B2 is beyond the
   last limit. */
int bestD(B2,fft_size,phi) double B2; int *fft_size,*phi;
{
  static const struct { double limit; int size, phi, D; } choice[] = {
    {         336.0,     8,     6,     21 },
    {        1152.0,    16,    12,     36 },
    {        5952.0,    32,    30,     93 },
    {       23424.0,    64,    60,    183 },
    {      134400.0,   128,   120,    525 },
    {      591360.0,   256,   240,   1155 },
    {     2365440.0,   512,   480,   2310 },
    {     9461760.0,  1024,   960,   4620 },
    {    39137280.0,  2048,  2016,   9555 },
    {   160849920.0,  4096,  3840,  19635 },
    {   648560640.0,  8192,  8064,  39585 },
    {  2611445760.0, 16384, 15840,  79695 },
    { 10824253440.0, 32768, 31680, 165165 },
    { 43751178240.0, 65536, 65280, 333795 }
  };
  unsigned int row;

  for (row=0; row<sizeof(choice)/sizeof(choice[0]); row++)
    if (B2<=choice[row].limit) {
      *fft_size=choice[row].size;
      *phi=choice[row].phi;
      return(choice[row].D);
    }
  printf("Error: too large B2\n"); exit(1);
}

/* Default exponent e of the step 2 polynomial x^e for FFT size K
   (1 for every size without a dedicated entry). */
int best_e(K) int K;
{
   switch (K) {
   case 256:   return 2;
   case 512:   return 3;
   case 1024:  return 6;
   case 2048:  return 12;
   case 4096:  return 30;
   case 8192:  return 60;
   case 16384: return 120;
   default:    return 1;
   }
}

/* Default step 2 bound for a given B1.
   Starts from 100*B1 and keeps growing the bound by 10% while the estimated
   cost of step 2 stays below half the estimated cost of step 1; returns the
   last bound that satisfied this test (100*B1 if none did). */
double default_B2 (B1) double B1;
{
  double step1_cost,step2_cost,bound,accepted,passes,degree; int K,phi,D;

  /* estimated number of modular mult. for step 1 */
  step1_cost = 9.0*B1/log(2.0);
  accepted = bound = 100.0 * B1;
  for (;;) {
    /* start from 12 passes and drop those not needed to reach the bound */
    passes = 12.0;
    D = bestD(bound/passes,&K,&phi);
    while (passes>0 && 2*(passes-1)*K*(double)D>=bound) passes--;
    degree = (double) best_e(K);
    step2_cost = 6.0*degree*(2*D/6.0) /* computation of nQx */
       + (28.5+12.0*(passes-1))*pow(K/2.0, log(3.0)/log(2.0)) /* poly. stuff */
       + 6.0*degree*passes*K; /* computation of (2D)^m*Q */
    if (!(step2_cost<step1_cost/2.0)) break;
    accepted = bound;
    bound *= 1.1;
  }
  return accepted;
}

/* writes the projective point (x::z) to stdout as "[x,z]" in base 10 */
void printout(x,z) mpz_t x,z;
{
   fputs("[",stdout);
   mpz_out_str(stdout,10,x);
   fputs(",",stdout);
   mpz_out_str(stdout,10,z);
   fputs("]\n",stdout);
}

/* Runs one elliptic curve on n: curve set-up, step 1, then step 2.
   - p: receives the factor when one is found
   - n: number to factor
   - s: seed sigma of the curve; 0 means a random one is drawn.  s is ignored
        when the global x is nonzero: the globals a and x are then taken as
        curve parameter and starting point
   - B2: step 2 bound (step 1 uses the global B1)
   - e: exponent of the step 2 polynomial x^e; 0 leaves the choice to best_e
   - iter: maximal number of step 2 passes; any value <= 0 leaves the choice
        to the program (12 passes at most)
   Returns 0 iff no factor found, 1 otherwise.  The value put in p may be n
   itself; main() tests for this.
*/
int ecm(p,n,s,B2,e,iter) mpz_t p,n,s; double B2; int e,iter;
{
   unsigned int st,res,D,K,phi;

   mul=0; gcdexts=0; sizen=mpz_size(n);
   /* a common factor with 6 is reported at once */
   mpz_set_ui(p,6); mpz_gcd(p,n,p); if (mpz_cmp(p,one)) return(1);
   /* now gcd(n,6)=1 */
#ifdef CHECK
   if (mpz_probab_prime_p(n,25))
     printf("******* Warning: probable prime input\n");
#endif
   /* slower in step 1 than usual division for 2,568+ c120 (1.43)
      but faster for 2,671- c145 (1.40)
   */
   if ((base2 && (ispower2 = isbase2(n,1.43)))) {
     printf("recognized factor of 2^%d",(ispower2>0) ? ispower2 : -ispower2);
     if (ispower2>0) {
       mod=mod2plus;
       printf("+"); 
     }
     else {
       mod=mod2minus;
       printf("-");
     }
     printf("1, using special base-2 division\n");
     fflush(stdout);
   }
   else
#ifdef MODMULN
     { mod=mpz_mod_n;
     mpz_set_ui(v,1); mpz_mul_2exp(v,v,mp_bits_per_limb); /* v=2^k */
     mpz_gcdext(z,u,NULL,n,v);
     /* z should be 1 since n is odd and v a power of 2 */
     if (mpz_cmp_ui(z,1)!=0) { printf("gcd(n,R) is not 1\n"); exit(1); }
     mpz_neg(u,u); mpz_mod(u,u,v);
     Nprim=PTR(u)[0]; /* Nprim * n = -1 mod v=2^k, the word base */ }
#else
     mod=(void*)mpz_mod;
#endif
   if (iter<=0) iter=12; /* max number of iterations */
   D=bestD(B2/iter,&K,&phi);
   if (e==0) e=best_e(K);
   /* adjust iter if too large; at least one pass is kept, which also makes
      the loop terminate when B2<=0 */
   while (iter>1 && 2*(iter-1)*K*(double)D>=B2) iter--;
   printf("Using B1=%d, B2=%1.0f",B1,2*iter*K*(double)D);
   if (verbose) printf(" (%u*%u*%u)",iter,K,2*D);
   printf(", polynomial x^%u",e);
   if (mpz_cmp_ui(x,0)) /* start from given a and x instead of s */
     mpz_set_ui(z,1);
   else {
   /* generates a random starting point using (11) from [2], or take the 's' given */
   if (mpz_cmp_ui(s,0)) mpz_set(u,s);
   else { /* generate a random sigma */
#if defined (ANSIONLY) || defined (__DJGPP__)
     mpz_set_ui(v,random()); /* thanks to Conrad Curry, generates 31-bits */
#else
     mpz_set_ui(v,lrand48());
#endif
     mpz_mod(u,v,n); 
   }
   printf(", sigma="); mpz_out_str(stdout,10,u);
   mpz_mul_ui(w,u,4); mpz_mod(v,w,n); /* v = (4*s) mod n */
   mpz_mul(x,u,u); mpz_sub_ui(w,x,5); mpz_mod(u,w,n); /* u = (s^2-5) mod n */
   mpz_mul(x,u,u); mpz_mul(w,x,u); mpz_mod(x,w,n); /* x = u^3 mod n */
   mpz_mul(z,v,v); mpz_mul(w,z,v); mpz_mod(z,w,n); /* z:=v^3 mod n */
   mpz_mul(b,x,v); mpz_mul_ui(w,b,4); mpz_mod(b,w,n); /* b = (4*x*v) mod n */
   mpz_sub(a,v,u); mpz_mul(w,a,a); mpz_mul(w,w,a); mpz_mod(w,w,n); /* w = (v-u)^3*/
   mpz_mul_ui(a,u,3); mpz_add(a,a,v); mpz_mul(w,w,a); mpz_mod(a,w,n);
   /* a = ((v-u)^3*(3*u+v)) mod n */
   mpz_gcdext(p,u,NULL,b,n); gcdexts++; /* p = gcd(b,n) = u*b mod n */
   if (mpz_cmp(p,one)) goto youpi;
   mpz_mul(a,a,u); mpz_sub_ui(a,a,2); mpz_mod(a,a,n); /* a = a/b-2 mod n */
 }
   printf("\n"); fflush(stdout);
   if (verbose) {
     printf("A="); mpz_out_str(stdout,10,a); printf("\n"); fflush(stdout);
   }
   mpz_add_ui(b,a,2);
   /* halve b twice modulo n: n is added first whenever b is odd */
   if (mpz_mod_ui(w,b,2)) mpz_add(b,b,n);
   mpz_tdiv_q_2exp(b,b,1); /* b = b/2 */
   if (mpz_mod_ui(w,b,2)) mpz_add(b,b,n);
   mpz_tdiv_q_2exp(b,b,1); /* b = b/2 */
   /* now b = (a+2)/4 mod n */
   /* normalize the starting point to z=1; a non-invertible z gives a factor */
   mpz_gcdext(p,u,NULL,z,n); gcdexts++; if (mpz_cmp(p,one)) goto youpi;
   mpz_mul(x,x,u); mpz_mod(x,x,n);
   mpz_set_ui(z,1);
   if (verbose) {
     printf("starting point: x="); mpz_out_str(stdout,10,x); printf("\n");
     fflush(stdout);
   }
   /* Step 1 */
   st=cputime();
   res=step1(p);
   printf("Step 1 took %dms for %d muls, %d gcdexts\n",
		      cputime()-st,mul,gcdexts); fflush(stdout);
   if (res) {
     printf("********** Factor found in step 1: "); mpz_out_str(stdout,10,p);
     printf("\n"); fflush(stdout); goto youpi;
   }
   /* Step 2, with fresh operation counters */
   mul=gcdexts=0;
   st=cputime();
   res=step2(p,n,B1,B2,x,a,(int)e,iter);
   printf("Step 2 took %dms for %d muls, %d gcdexts\n",
		      cputime()-st,mul,gcdexts); fflush(stdout);
   if (res) {
     printf("********** Factor found in step 2: "); mpz_out_str(stdout,10,p);
     printf("\n"); fflush(stdout); goto youpi;
   }
   return(0);
 youpi:
   return(1);
}

#ifdef MODMULN
/* c <- c * R^k mod n, where R=2^mp_bits_per_limb.
   n is supposed odd. Does not need to be efficient. */
void mod_mul2exp(c,n,k) mpz_t c,n; unsigned int k;
{
  unsigned int shift;

  shift = k*mp_bits_per_limb; /* R^k = 2^shift */
  mpz_mul_2exp(c,c,shift);
  mpz_mod(c,c,n);
}

/* c <- c / R^k mod n, where R=2^mp_bits_per_limb.
   n is supposed odd. Does not need to be efficient: the inverse of R is
   obtained from one extended gcd and then applied k times. */
void mod_div2exp(c,n,k) mpz_t c,n; unsigned int k;
{
  mpz_t word,inverse,gcd;
  unsigned int done;

  mpz_init(inverse);
  mpz_init(gcd);
  mpz_init_set_ui(word,1);
  mpz_mul_2exp(word,word,mp_bits_per_limb); /* word = R */
  mpz_gcdext(gcd,inverse,NULL,word,n); /* gcd = 1 = inverse*R mod n */
  for (done=0; done<k; done++) {
    mpz_mul(c,c,inverse);
    mpz_mod(c,c,n);
  }
  mpz_clear(inverse);
  mpz_clear(gcd);
  mpz_clear(word);
}
#endif

/* Step 1: multiplies the global point (x:z) by every prime power <= B1.
   Works on the globals x, z, b, n and on the tables pr, prime, nbprimes, bb
   built by initprimes.
   returns 0 iff no factor found, otherwise returns factor in p.
   When no factor is found, x has been normalized so that z=1. */
int step1(p) mpz_t p;
{
  unsigned int l,i,j,q,imax,lmax,pp;

#ifdef MODMULN
   if (ispower2==0) {
   /* multiply (x,z) by R^sizen */
   mod_mul2exp(x,n,sizen); mod_mul2exp(z,n,sizen);
   mod_mul2exp(b,n,sizen); /* for duplicate */
   /* give every variable used in the arithmetic room for 2*sizen+1 limbs */
   _mpz_realloc(x,2*sizen+1); _mpz_realloc(z,2*sizen+1);
   _mpz_realloc(x1,2*sizen+1); _mpz_realloc(z1,2*sizen+1);
   _mpz_realloc(x2,2*sizen+1); _mpz_realloc(z2,2*sizen+1);
   _mpz_realloc(x3,2*sizen+1); _mpz_realloc(z3,2*sizen+1);
   _mpz_realloc(x4,2*sizen+1); _mpz_realloc(z4,2*sizen+1);
   _mpz_realloc(u,2*sizen+1); _mpz_realloc(v,2*sizen+1);
   _mpz_realloc(w,2*sizen+1); }
#endif
  /* treat the cases p=2 and p=3 separately */
  /* one doubling per power of 2 not exceeding B1 */
  for (q=2;q<=B1;q*=2) duplicate(x,z,x,z);
  /* one tripling (2P, then 2P+P) per power of 3 not exceeding B1 */
  for (q=3;q<=B1;q*=3) { duplicate(x1,z1,x,z); add3(x,z,x,z,x1,z1,x,z); }
  /* the remaining primes are found by sieving windows of bb integers */
  lmax = B1/bb;
  for (l=0;l<=lmax;l++) {
    /* check range l*bb <= p < (l+1)*bb */
    /* within this function pr[] holds the characters '1' (prime) and '0' */
    if (l) { /* sieve primes, pr[i] corresponds to l*bb+i */
      for (i=0;i<bb;i++) pr[i]='1';
      for (j=1;j<=nbprimes;j++) {
	/* delete multiples of prime[j] */
	q=prime[j];
	/* offset in the window of the first multiple of q */
	i=(q-((l*bb)%q)) % q;
	for(;i<bb;i+=q) pr[i]='0';
      }
    }
    else {
      /* first window: take the primes from the table, skipping prime[1]
	 and prime[2] (2 and 3, already handled above) */
      for (i=0;i<bb;i++) pr[i]='0';
      for (j=3;j<=nbprimes;j++) pr[prime[j]]='1';
    }
    /* do not go beyond B1 in the last window */
    imax = ((B1+1)<(l+1)*bb) ? B1+1-l*bb : bb;
    for (i=0;i<imax;i++)
      if (pr[i]=='1') {
	/* one call to prac per power of pp that does not exceed B1 */
	pp=l*bb+i; for (q=1;q<=B1/pp;q*=pp) prac(pp);
#ifdef GROUPORDER
	/* GROUPORDER builds take a gcd after every prime and report the
	   prime at which a factor appears */
#ifdef MODMULN
	mpz_set(w,z); if (ispower2==0) mod_div2exp(w,n,sizen); mpz_gcd(p,w,n);
#else
	mpz_gcd(p,n,z);
#endif
	if (mpz_cmp(p,one)) {
	  printf("last prime is %d\n",pp); return(1); }
#endif
      }
  }
#ifdef MODMULN
   /* divide (x,z) by R^sizen before gcd */
   if (ispower2==0) { mod_div2exp(x,n,sizen); mod_div2exp(z,n,sizen); }
#endif
   /* p = gcd(z,n); w is the cofactor of z, i.e. its inverse mod n when p=1 */
   mpz_gcdext(p,w,NULL,z,n); gcdexts++;
   if (mpz_cmp(p,one)) return(1);
  /* normalizes z to 1 */
  mpz_mul(x,x,w); mpz_mod(x,x,n); mpz_set_ui(z,1);
  return(0);
}

/* Initializes the tables of primes up to max(ceil(sqrt(B)+0.5),b), rounded
   up to an even bound.
   On return bb holds that bound, pr[i] is 1 iff i is prime for 0<=i<=bb,
   and prime[1..nbprimes] lists the primes up to bb in increasing order
   (prime[0] is unused).  Does nothing when the current tables already reach
   the requested bound.  Prints an error and exits with status 1 when memory
   cannot be allocated. */
void initprimes(B,b) double B; int b;
{
  int i,j;

  i = (int)ceil(sqrt(B)+0.5);
  if (i>b) b=i;
  if (b%2) b++; /* ensures b is even for Step 1 */
  if (b<=(int)bb) return; /* already done */
  free(pr);
  pr = (unsigned char*) malloc(b+1);
  if (pr == NULL) { printf("Error: not enough memory\n"); exit(1); }
  /* compute primes up to b (sieve of Eratosthenes) */
  pr[0]=pr[1]=0;
  for (i=2;i<=b;i++) pr[i]=1;
  j=2; do {
    for (i=j*j;i<=b;i+=j) pr[i]=0;
    /* advance to the next prime; the scan is bounded by b so that it never
       reads past the end of pr (this happens for b=2 otherwise) */
    do j++; while (j<=b && pr[j]==0);
  } while (j*j<=b);
  for (nbprimes=0,i=2;i<=b;i++) if (pr[i]!=0) nbprimes++;
  free(prime);
  prime = (unsigned int*) malloc((nbprimes+1)*sizeof(*prime));
  if (prime == NULL) { printf("Error: not enough memory\n"); exit(1); }
  for (j=0,i=2;i<=b;i++) if (pr[i]!=0) prime[++j]=i;
  bb=b;
}

#define ADD 6 /* multiplications counted for one addition */
#define DUP 5 /* multiplications counted for one duplicate */

/* Returns the number of modular multiplications of the Lucas chain for n
   that starts from the ratio v: the same reduction rules as in prac are
   applied to the pair (d,e) until d=e, adding the cost of each rule.
   Prints an error and exits with status 1 if no rule applies. */
unsigned int lucas_cost(n, v) unsigned int n; double v;
{
  unsigned int cost,hi,lo,tmp;

  tmp=(unsigned int)((double)n/v+0.5);
  hi=n-tmp; lo=2*tmp-n;
  cost=DUP+ADD; /* initial duplicate and final addition */
  while (hi!=lo) {
    if (hi<lo) { tmp=hi; hi=lo; lo=tmp; } /* keep hi >= lo */
    if (4*hi<=5*lo && ((hi+lo)%3)==0) { /* condition 1: 3 additions */
      tmp=(2*hi-lo)/3; lo=(2*lo-hi)/3; hi=tmp;
      cost+=3*ADD;
      continue;
    }
    if (4*hi<=5*lo && (hi-lo)%6==0) { /* condition 2: 1 add, 1 dup */
      hi=(hi-lo)/2;
      cost+=ADD+DUP;
      continue;
    }
    if (hi<=(4*lo)) { /* condition 3: 1 add */
      hi-=lo;
      cost+=ADD;
      continue;
    }
    if ((hi+lo)%2==0) { /* condition 4: 1 add, 1 dup */
      hi=(hi-lo)/2;
      cost+=ADD+DUP;
      continue;
    }
    if (hi%2==0) { /* condition 5: 1 add, 1 dup */
      hi/=2;
      cost+=ADD+DUP;
      continue;
    }
    if (hi%3==0) { /* condition 6: 3 adds, 1 dup */
      hi=hi/3-lo;
      cost+=3*ADD+DUP;
      continue;
    }
    if ((hi+lo)%3==0) { /* condition 7: 3 adds, 1 dup */
      hi=(hi-2*lo)/3;
      cost+=3*ADD+DUP;
      continue;
    }
    if ((hi-lo)%3==0) { /* condition 8: 3 adds, 1 dup */
      hi=(hi-lo)/3;
      cost+=3*ADD+DUP;
      continue;
    }
    if (lo%2==0) { /* condition 9: 1 add, 1 dup */
      lo/=2;
      cost+=ADD+DUP;
      continue;
    }
    printf("no condition qualifies for d=%u e=%u\n",hi,lo); exit(1);
  }
  return(cost);
}

/* number of candidate ratios tried by prac */
#define NV 10

/* computes nP from P=(x:z) and puts the result in (x:z). Assumes n>2.
   The ratio v[i] giving the cheapest chain according to lucas_cost is
   selected first; the chain is then executed with add3 and duplicate on the
   global work points (x1:z1)..(x4:z4).  Points are exchanged by swapping
   pointers, not by copying values.
   Exits with status 1 if no condition applies to the current (d,e). */
void prac(n) unsigned int n;
{
   unsigned int d,e,r,i;
   __mpz_struct *xA,*zA,*xB,*zB,*xC,*zC,*xT,*zT,*xT2,*zT2,*t;
   static double v[10] = 
     {1.61803398875,1.72360679775,1.618347119656,1.617914406529,1.612429949509,
    1.632839806089,1.620181980807,1.580178728295,1.617214616534,1.38196601125};
   /* chooses the best value of v */
   /* NOTE(review): i is assigned only when some v[d] costs less than ADD*n;
      presumably always the case for the n>2 this function is called with --
      confirm, since i is read below without any other initialization */
   for (d=0,r=ADD*n;d<NV;d++) {
     e=lucas_cost(n,v[d]);
     if (e<r) { r=e; i=d; }
   }
   d=n;
   r=(int)((double)d/v[i]+0.5);
   /* A=(x:z) B=(x1:z1) C=(x2:z2) T=T1=(x3:z3) T2=(x4:z4) */
   xA=x; zA=z; xB=x1; zB=z1; xC=x2; zC=z2; xT=x3; zT=z3; xT2=x4; zT2=z4;
   /* first iteration always begins by Condition 3, then a swap */
   d=n-r; e=2*r-n; 
#ifdef COUNTADD
   cond3++;
#endif
   mpz_set(xB,xA); mpz_set(zB,zA); /* B=A */
   mpz_set(xC,xA); mpz_set(zC,zA); /* C=A */
   duplicate(xA,zA,xA,zA); /* A=2*A */
   /* same reduction of (d,e) as in lucas_cost, now applied to the points */
   while (d!=e) {
         /* keep d >= e, exchanging the roles of A and B along with them */
         if (d<e) { r=d; d=e; e=r; t=xA; xA=xB; xB=t; t=zA; zA=zB; zB=t; }
	 /* do the first line of Table 4 whose condition qualifies */
	 if (4*d<=5*e && ((d+e)%3)==0) { /* condition 1 */
#ifdef COUNTADD
	    cond1++;
#endif
	    r=(2*d-e)/3; e=(2*e-d)/3; d=r;
	    add3(xT,zT,xA,zA,xB,zB,xC,zC); /* T = f(A,B,C) */
	    add3(xT2,zT2,xT,zT,xA,zA,xB,zB); /* T2 = f(T,A,B) */
	    add3(xB,zB,xB,zB,xT,zT,xA,zA); /* B = f(B,T,A) */
	    t=xA; xA=xT2; xT2=t; t=zA; zA=zT2; zT2=t; /* swap A and T2 */
	  } else
	 if (4*d<=5*e && (d-e)%6==0) { /* condition 2 */
#ifdef COUNTADD
	   cond2++;
#endif
	   d=(d-e)/2; 
	   add3(xB,zB,xA,zA,xB,zB,xC,zC); /* B = f(A,B,C) */
	   duplicate(xA,zA,xA,zA); /* A = 2*A */
	 } else
	 if (d<=(4*e)) { /* condition 3 */
#ifdef COUNTADD
	   cond3++;
#endif
	   d-=e; 
	   add3(xT,zT,xB,zB,xA,zA,xC,zC); /* T = f(B,A,C) */
	   t=xB; xB=xT; xT=xC; xC=t;
	   t=zB; zB=zT; zT=zC; zC=t; /* circular permutation (B,T,C) */
	 } else
	 if ((d+e)%2==0) { /* condition 4 */
#ifdef COUNTADD
	   cond4++;
#endif
	   d=(d-e)/2; 
	   add3(xB,zB,xB,zB,xA,zA,xC,zC); /* B = f(B,A,C) */
	   duplicate(xA,zA,xA,zA); /* A = 2*A */
	 } else
	 if (d%2==0) { /* condition 5 */
#ifdef COUNTADD
	   cond5++;
#endif
	   d/=2; 
	   add3(xC,zC,xC,zC,xA,zA,xB,zB); /* C = f(C,A,B) */
	   duplicate(xA,zA,xA,zA); /* A = 2*A */
	 } else
	 if (d%3==0) { /* condition 6 */
#ifdef COUNTADD
	   cond6++;
#endif
	   d=d/3-e; 
	   duplicate(xT,zT,xA,zA); /* T1 = 2*A */
	   add3(xT2,zT2,xA,zA,xB,zB,xC,zC); /* T2 = f(A,B,C) */
	   add3(xA,zA,xT,zT,xA,zA,xA,zA); /* A = f(T1,A,A) */
	   add3(xT,zT,xT,zT,xT2,zT2,xC,zC); /* T1 = f(T1,T2,C) */
	   t=xC; xC=xB; xB=xT; xT=t;
	   t=zC; zC=zB; zB=zT; zT=t; /* circular permutation (C,B,T) */
	 } else
	 if ((d+e)%3==0) { /* condition 7 */
#ifdef COUNTADD
	   cond7++;
#endif
	   d=(d-2*e)/3; 
	   add3(xT,zT,xA,zA,xB,zB,xC,zC); /* T1 = f(A,B,C) */
	   add3(xB,zB,xT,zT,xA,zA,xB,zB); /* B = f(T1,A,B) */
	   duplicate(xT,zT,xA,zA); add3(xA,zA,xA,zA,xT,zT,xA,zA); /* A = 3*A */
	 } else
	 if ((d-e)%3==0) { /* condition 8 */
#ifdef COUNTADD
	   cond8++;
#endif
	   d=(d-e)/3; 
	   add3(xT,zT,xA,zA,xB,zB,xC,zC); /* T1 = f(A,B,C) */
	   add3(xC,zC,xC,zC,xA,zA,xB,zB); /* C = f(C,A,B) */
	   t=xB; xB=xT; xT=t; t=zB; zB=zT; zT=t; /* swap B and T */
	   duplicate(xT,zT,xA,zA);
	   add3(xA,zA,xA,zA,xT,zT,xA,zA); /* A = 3*A */
	 } else
	 if (e%2==0) { /* condition 9 */
#ifdef COUNTADD
	   cond9++;
#endif
	   e/=2; 
	   add3(xC,zC,xC,zC,xB,zB,xA,zA); /* C = f(C,B,A) */
	   duplicate(xB,zB,xB,zB); /* B = 2*B */
	 } else
	 { printf("no condition qualifies for d=%u e=%u\n",d,e); exit(1); }
       }
       /* final addition: A = f(A,B,C) */
       add3(xA,zA,xA,zA,xB,zB,xC,zC);
#ifdef DEBUG
   if (d!=1) { printf("d!=1 at the end of PRAC\n"); exit(1); }
#endif
   /* after the pointer swaps A may live in a work variable: copy it back */
   if (x!=xA) { mpz_set(x,xA); mpz_set(z,zA); }
}

/* Differential addition: adds Q=(x2:z2) and R=(x1:z1) and puts the result in
   (x3:z3), where (x:z) is the difference of Q and R (Q-R or R-Q).
   Uses the following global variables:
     - n : number to factor
     - mod : reduction routine currently selected
     - u, v, w : auxiliary variables
     - adds, mul : operation counters
   Modifies: x3, z3, u, v, w.
   (x3,z3) may be identical to (x2,z2) and to (x,z): both outputs are written
   only after the inputs they could alias have been consumed, so the order of
   the statements below must be kept.
*/
void add3(x3,z3,x2,z2,x1,z1,x,z) mpz_t x3,z3,x2,z2,x1,z1,x,z;
{
   adds++;
   /* u = (x2-z2)*(x1+z1), reduced */
   mpz_sub(u,x2,z2);
   mpz_add(v,x1,z1);
   mpz_mul(u,u,v);
   mod(u,u,n);
   /* v = (x2+z2)*(x1-z1), reduced */
   mpz_add(w,x2,z2);
   mpz_sub(v,x1,z1);
   mpz_mul(v,w,v);
   mod(v,v,n);
   /* w = (u+v)^2 and v = u-v, from the two products above */
   mpz_add(w,u,v);
   mpz_sub(v,u,v);
   mpz_mul(w,w,w);
#ifndef NORMALIZE
   /* z is not known to be 1: w = z*(u+v)^2, one more multiplication */
   mod(w,w,n);
   mpz_mul(w,w,z);
   mul++;
#endif
   /* v = (u-v)^2, reduced */
   mpz_mul(v,v,v);
   mod(v,v,n);
   /* z3 = x*(u-v)^2, reduced */
   mpz_mul(z3,x,v);
   mod(z3,z3,n);
   /* x3 = w, reduced */
   mod(w,w,n);
   mpz_set(x3,w);
   mul += 5;
}

/* Doubling: computes 2P=(x2:z2) from P=(x1:z1), counted as 5 multiplications.
   Uses the following global variables:
     - n : number to factor
     - b : (a+2)/4 mod n
     - mod : reduction routine currently selected
     - u, v, w : auxiliary variables
   Modifies: x2, z2, u, v, w.
   Callers pass (x2,z2) identical to (x1,z1); the statement order below
   makes this safe.
*/
void duplicate(x2,z2,x1,z1) mpz_t x2,z2,x1,z1;
{
#ifdef COUNTADD
   dups++;
#endif
   /* u = (x1+z1)^2, reduced */
   mpz_add(w,x1,z1);
   mpz_mul(u,w,w);
   mod(u,u,n);
   /* v = (x1-z1)^2, reduced */
   mpz_sub(w,x1,z1);
   mpz_mul(v,w,w);
   mod(v,v,n);
   /* x2 = u*v, reduced */
   mpz_mul(x2,u,v);
   mod(x2,x2,n);
   /* w = u-v = 4*x1*z1 */
   mpz_sub(w,u,v);
   /* u = v+b*w */
   mpz_mul(u,b,w);
   mod(u,u,n);
   mpz_add(u,u,v);
   /* z2 = w*u, reduced */
   mpz_mul(z2,w,u);
   mod(z2,z2,n);
   mul += 5;
}

/* Variant of duplicate for a point with z1=1: computes (x2:z2) from x1,
   counted as 4 multiplications.
   Uses the globals n, b, mod and the auxiliary variables u, v, w. */
void duplicate2(x2,z2,x1) mpz_t x2,z2,x1; /* here z1=1 */
{
   /* v = (x1-1)^2, reduced */
   mpz_sub_ui(w,x1,1);
   mpz_mul(v,w,w);
   mod(v,v,n);
   /* w = 4*x1 */
   mpz_mul_2exp(w,x1,2);
#ifdef MODMULN
   /* (x1+1)^2 = (x1-1)^2 + 4*x1/R^sizen */
   mod(w,w,n);
#endif
   /* u = v+w, i.e. (x1+1)^2 */
   mpz_add(u,v,w);
   /* x2 = u*v, reduced */
   mpz_mul(x2,u,v);
   mod(x2,x2,n);
   /* u = v+b*w */
   mpz_mul(u,b,w);
   mod(u,u,n);
   mpz_add(u,u,v);
   /* z2 = w*u, reduced */
   mpz_mul(z2,w,u);
   mod(z2,z2,n);
   mul += 4;
}

/* Greatest common divisor of a and b by the Euclidean algorithm.
   igcd(a,0) is a, hence igcd(0,0) is 0. */
unsigned int igcd(a,b) unsigned int a,b;
{
  unsigned int rem;

  for (;;) {
    if (b == 0)
      return a;
    rem = a % b;
    a = b;
    b = rem;
  }
}

/* returns k such that n=2^k.
   n must be a positive power of two.  Any other value prints an error
   message and terminates the program; this includes zero and negative
   numbers, which the halving loop alone would let through with result 0. */
int lg(n) int n;
{
  int k=0;

  if (n<1) { printf("Error: not a power of two\n"); exit(1); }
  while (n>1) {
    if (n%2!=0) { printf("Error: not a power of two\n"); exit(1); }
    n/=2; k++;
  }
  return k;
}

/* Step 2: improved standard continuation, cf [2] p. 7-8.
   - p: variable to put factor found
   - n: number to factor
   - B1,B2: bounds for step 1 and 2
   - x: x-coordinate of Q at the end of step 1 (z normalized to 1);
        overwritten with the x-coordinate in Weierstrass form
   - a: parameter from the curve b*y^2 = x^3 + a*x^2 + x used in step 1;
        overwritten with the coefficient of the Weierstrass form
   - e: exponent applied to the multipliers: the points used are
        (6d+1)^e*Q and (2kD)^e*Q; the work arrays hold 2*(e+1) cells
   - iter: number of passes of giant steps; B2/iter is handed to bestD and
        the polynomials of successive passes are multiplied together mod F
   Returns 0 iff no factor found, otherwise puts factor in p.
   Also uses the globals u, v (scratch), pr, sizen, one, mod, base2,
   ispower2, verbose and mul.

   Cleanup: F and T are allocated only after the baby steps, so they start
   as NULL and are released only when set; a factor found earlier jumps to
   youpi2 before they exist.  The early return after the first inversion
   releases g and y, the only objects initialised at that point.
*/
int step2(p,n,B1,B2,x,a,e,iter) 
mpz_t p,n,x,a; unsigned int B1,iter; double B2; int e;
{
   mpz_t *T=NULL,**F=NULL,*nQx,*G,*lx,*ly,g,y,*tx,*ty; double m;
   int i,j,st,D,K,phi,residue,k; unsigned int res=0,nbit;

   st=cputime();
   if (verbose) {
     printf("x="); mpz_out_str(stdout,10,x); printf("\n"); fflush(stdout);
   }
   /* faster for 2,951- c158 (1.82)
      but slower for 2,749- c123 (1.85) */
   if ((base2 && (ispower2 = isbase2(n,1.84)))) {
     if (ispower2>0) mod=mod2plus; else mod=mod2minus;
   }
   else mod= (mod_t)mpz_mod; /* can't use modmuln here */
   /* determines g,y such that g*y^2 = x^3 + a*x^2 + x */
   mpz_init(y); mpz_set_ui(y,1);
   mpz_init(g); mpz_add(g,x,a); mpz_mul(g,g,x); mpz_add_ui(g,g,1);
   mpz_mul(g,g,x); mod(g,g,n);
   /* change of coordinates x=g*X-a/3, y=g*Y to return to Weierstrass form */
   mpz_mul_ui(u,g,3); mpz_mul(u,u,g); mod(u,u,n);
   mpz_gcdext(p,v,NULL,u,n); /* v=1/(3g^2)*/
   if (mpz_cmp_ui(p,1)!=0) {
     /* factor found while inverting: nothing but g and y exists yet */
     mpz_clear(g); mpz_clear(y);
     return(1);
   }
   mpz_mul_ui(x,x,3); mpz_add(x,x,a); mpz_mul(x,x,g);
   mpz_mul(x,x,v); mod(x,x,n); /* x = (x+a/3)/g = g*(3x+a)/(3g^2) */
   mpz_mul_ui(y,y,3); mpz_mul(y,y,g); mpz_mul(y,y,v); mod(y,y,n);
   mpz_mul(a,a,a); mpz_sub_ui(a,a,3); mpz_neg(a,a);
   mpz_mul(a,a,v); mod(a,a,n);
   D=bestD(B2/iter,&K,&phi);
   initprimes(B2,2*D);
   /* with Q the point obtained by Step 1, we compute (6d+1)^e*Q for 0<=d<=D/3,
     (6i+1)^e*Q is stored in nQ[i], and 2DQ is stored in nQ[D/3+1]
     George Woltman noticed we don't need 2*d*Q for D/2 < d < D */
   nQx = (mpz_t*) malloc(K*sizeof(mpz_t));
   for (i=0;i<K;i++) mpz_init(nQx[i]);
   /* G[k] stores (2(k+1)D)^e*Q */
   G = (mpz_t*) malloc((K+1)*sizeof(mpz_t));
   for (k=0;k<=K;k++) mpz_init(G[k]);
   lx = (mpz_t*) malloc(2*(e+1)*sizeof(mpz_t));
   ly = (mpz_t*) malloc(2*(e+1)*sizeof(mpz_t));
   tx = lx+(e+1); ty = ly+(e+1); /* upper halves serve as scratch */
   for (j=0;j<2*(e+1);j++) { mpz_init(lx[j]); mpz_init(ly[j]); }
   mpz_set(lx[0],x); mpz_set(ly[0],y); mpz_set(nQx[0],x);
   for (i=1;i<=e;i++) {
     mpz_set(lx[i],x); mpz_set(ly[i],y);
     for (j=0;j<e;j++) {
       res=multiplyW(p,lx[i],ly[i],lx[i],ly[i],6*i+1,n,a,tx[0],ty[0]);
       if (res) goto youpi2;
     }
   }
   for (i=1;i<=e;i++)
     for (j=e;j>=i;j--) {
       res=subW(p,lx[j],ly[j],lx[j],ly[j],lx[j-1],ly[j-1],n,tx[0],ty[0]);
       if (res) goto youpi2;
     }
   for (i=7,residue=1;i<=2*D;i+=6) {
     res=addWn(p,lx,ly,n,tx,ty,e); if (res) goto youpi2;
     if (igcd(i,D)==1) mpz_set(nQx[residue++],lx[0]);
   }
   if (residue!=phi) { printf("Error: residue!=phi\n"); exit(1); }
   /* fill last places with random numbers */
   for (i=phi;i<K;i++) { mpz_random(nQx[i], sizen); mod(nQx[i],nQx[i],n); }
   if (phi==K) { printf("error: phi=K\n"); exit(1); }
   mpz_set_ui(nQx[K-1], 0);
   F = (mpz_t**) malloc((lg(K)+2)*sizeof(mpz_t*));
   F[0] = nQx;
   T = (mpz_t*) malloc(5*K*sizeof(mpz_t)); /* for Karatsuba */
   for (i=0;i<5*K;i++) mpz_init(T[i]);
   buildF(F,T,K);
   /* now F[i] contains polynomials of degree 2^i */

   /* Q <- (2D)^e*Q */
   for (j=0;j<e;j++) {
     res=multiplyW(p,x,y,x,y,2*D,n,a,tx[0],ty[0]);
     if (res) goto youpi2;
   }
   mpz_set(lx[0],x); mpz_set(ly[0],y);
   for (i=1;i<=e;i++) {
     mpz_set(lx[i],x); mpz_set(ly[i],y);
     for (j=0;j<e;j++) {
       res=multiplyW(p,lx[i],ly[i],lx[i],ly[i],i+1,n,a,tx[0],ty[0]);
       if (res) goto youpi2;
     }
   }
   for (i=1;i<=e;i++)
     for (j=e;j>=i;j--) {
       res=subW(p,lx[j],ly[j],lx[j],ly[j],lx[j-1],ly[j-1],n,tx[0],ty[0]);
       if (res) goto youpi2;
     }
   /* now (lx[0],ly[0]) is (2D)^e*Q */
   for (i=0;i<D;i++) pr[i]=1;
   for (nbit=0,m=4.0*D;nbit<iter;nbit++) {
     for (k=0;k<K-1;m+=2.0*(double)D) {
       res = (e==1 && m==4.0*D) /* only case where lx[j]=lx[j+1] */
         ? duplicateW(p,lx[0],ly[0],lx[0],ly[0],n,a,tx[0],ty[0])
         : addWn(p,lx,ly,n,tx,ty,e);
       if (res) goto youpi2;
       /* now (lx[0],ly[0]) is m^e*Q */
       if (m+2.0*(double)D>(double)B1) mpz_set(G[k++],lx[0]);
     }
     mpz_set_ui(G[K-1],0);
     buildG(G,T+K,K);
     if (iter>1) { 
       if (nbit==0) { /* copies  into T */
         for (k=0;k<K;k++) mpz_set(T[k],G[k]);
       }
       else polymulmod(G,F,T,K);
     }
   }
   if (iter>1) for (k=0;k<K;k++) mpz_set(G[k], T[k]);
   fflush(stdout);
#ifdef NAIVE
/* O(n^2) way */
   mpz_set_ui(g,1);
   for (i=0;i<phi;i++)
     for (k=0;k<K;k++) {
       mpz_sub(y,nQx[i],G[k]);
if (mpz_cmp_ui(y,0)==0) printf("match between nQx[%d] and G[%d]\n",i,k);
       mpz_mul(g,g,y); mul++; mod(g,g,n);
     }
#else
   polyeval(G,F,T,K);
   /* now G[0]..G[K-1] contains the values of G(nQx[0])..G(nQx[K-1]) */
   mpz_set(g,G[0]);
   for (i=1;i<K;i++) { mpz_mul(g,g,G[i]); mod(g,g,n); }
#endif
   mpz_gcd(p,g,n); if (mpz_cmp(p,one)) res=1;
 youpi2:
   mpz_clear(g); /* thanks to Paul Leyland */
   mpz_clear(y);
   for (i=0;i<K;i++) mpz_clear(nQx[i]);
   free(nQx);
   for (k=0;k<=K;k++) mpz_clear(G[k]);
   free(G);
   for (i=0;i<2*(e+1);i++) { mpz_clear(lx[i]); mpz_clear(ly[i]); }
   free(lx); free(ly);
   if (F!=NULL) {
     /* F[0] is nQx, released above; levels 1..lg(K)+1 come from buildF */
     for (i=1;i<lg(K)+2;i++) {
       for (j=0;j<=K;j++) mpz_clear(F[i][j]);
       free(F[i]);
     }
     free(F);
   }
   if (T!=NULL) {
     for (i=0;i<5*K;i++) mpz_clear(T[i]);
     free(T);
   }
   return(res);
 }

/* Multiplies the polynomial held in T[0..K-1] by the one in G[0..K-1] and
   reduces the product modulo the polynomial stored in F[lg(K)], using its
   reciprocal stored in F[lg(K)+1].
   - G: second factor; overwritten (it ends up holding the low K
        coefficients of the unreduced product)
   - F: table filled by buildF
   - T: on entry T[0..K-1] is the first factor, on exit T[0..K-1] is the
        reduced product; T[K..5K-1] is scratch space
   - K: number of coefficients, a power of two (lg(K) is called)
*/
void polymulmod(G,F,T,K) mpz_t *G,**F,*T; int K;
{
  int k; unsigned int st;

  st=cputime();
  /* assumes previous remainder is in T[0]..T[K-1] */
  karatsuba(T+K, T, G, K, T+3*K); /* T[K]..T[3K-2] <- T*G */
  /* now reduces mod F using recip(F) = quo(X^(2K),F) */
  for (k=0;k<K;k++) mpz_set(G[k], T[K+k]); 
  for (k=0;k<K-1;k++) mpz_set(T[k], T[2*K+k]); 
  /* now low part of T*G is in G[0..K-1] and high part in T[0..K-2] */
  mpz_set_ui(T[K-1], 0); /* pad the high part to K coefficients */
  karatsuba(T+K, T, F[lg(K)+1], K, T+3*K);
  mpz_set_ui(T[3*K-1], 0); /* karatsuba fills only T[K..3K-2] */
  for (k=0;k<K;k++) mpz_add(T[2*K+k], T[2*K+k], T[k]); /* for x^K */
  /* now T[K..3K-1] contains quo(T*G/x^K)*quo(x^(2K)/F) */
  for (k=0;k<K;k++) mpz_set(T[k], T[2*K+k]); /* keep the upper half: the quotient */
  karatsuba(T+K, T, F[lg(K)], K, T+3*K);
  /* T[K..3K-2] contains F*quotient(T*G,F) */
  for (k=0;k<K;k++) mpz_sub(T[k], G[k], T[K+k]); /* remainder = product - F*quotient */
  if (verbose) printf("Reducing g*h mod f took %dms\n",cputime()-st);
}

/* Return user CPU time measured in milliseconds. Thanks to Torbjorn.
   Two implementations are selected at compile time: one based on clock()
   for the systems listed below, one based on getrusage() elsewhere. */
#if defined (ANSIONLY) || defined (USG) || defined (__SVR4) || defined (_UNICOS) || defined(__hpux)
#include <time.h>

int
cputime ()
{
  double ticks = (double) clock ();

  /* convert clock ticks to milliseconds */
  return (int) (ticks * 1000 / CLOCKS_PER_SEC);
}
#else
#include <sys/types.h>
#include <sys/resource.h>

int
cputime ()
{
  struct rusage usage;

  /* first argument 0: presumably RUSAGE_SELF, i.e. the calling process */
  getrusage (0, &usage);
  /* user time only: whole seconds plus microseconds, both in ms */
  return usage.ru_utime.tv_sec * 1000 + usage.ru_utime.tv_usec / 1000;
}
#endif

/* Detects whether n divides a number of the form 2^k+1 or 2^k-1.
   With lo = (bit length of n) - 1, the residue 2^(2*lo) mod n is compared
   with a power of two, first as is and then negated:
     2^(2*lo) =  2^j (mod n)  means n divides 2^(2*lo-j)-1, result -(2*lo-j)
     2^(2*lo) = -2^j (mod n)  means n divides 2^(2*lo-j)+1, result +(2*lo-j)
   The result is forced to 0 when its absolute value exceeds threshold*lo
   (truncated to an int), and is 0 when neither test matches. */
int isbase2(n, threshold) mpz_t n; double threshold;
{
  unsigned int expo,lo; int result=0; mpz_t pow2,rem;

  mpz_init(pow2);
  mpz_init(rem);
  lo=mpz_sizeinbase(n,2)-1;

  /* rem = 2^(2*lo) mod n */
  mpz_set_ui(pow2,1);
  mpz_mul_2exp(pow2,pow2,2*lo);
  mpz_mod(rem,pow2,n);

  /* is rem an exact power of two? */
  expo=mpz_sizeinbase(rem,2)-1;
  mpz_set_ui(pow2,1);
  mpz_mul_2exp(pow2,pow2,expo);
  if (mpz_cmp(rem,pow2)!=0) {
    /* no: is -rem mod n an exact power of two? */
    mpz_neg(rem,rem);
    mpz_mod(rem,rem,n);
    expo=mpz_sizeinbase(rem,2)-1;
    mpz_set_ui(pow2,1);
    mpz_mul_2exp(pow2,pow2,expo);
    if (mpz_cmp(rem,pow2)==0)
      result=2*lo-expo;
  }
  else
    result=expo-2*lo;

  mpz_clear(pow2);
  mpz_clear(rem);
  if (abs(result)>(int)(threshold*lo))
    result=0;
  return(result);
}

/* Reduction of b modulo n for n dividing N = 2^ispower2 + 1.
   b is split at bit ispower2 into a low part and a high part; since
   2^ispower2 = -1 mod N, b is congruent to low - high.  A final mpz_mod
   finishes the reduction modulo n.
   Uses the global y as scratch; the low part is saved in y before a is
   written, so a and b may be the same variable. */
void mod2plus(a,b,n) mpz_t a,b,n; /* N = 2^ispower2 + 1 */
{
    unsigned long int shift = ispower2;

    mpz_tdiv_r_2exp(y,b,shift);   /* y = low part  */
    mpz_tdiv_q_2exp(a,b,shift);   /* a = high part */
    mpz_sub(a,y,a);               /* 2^k = -1 */
    mpz_mod(a,a,n);
}

/* Reduction of b modulo n for n dividing N = 2^k - 1, where k = -ispower2
   (ispower2 is negative in this case).
   b is split at bit k into a low part and a high part; since 2^k = 1 mod N,
   b is congruent to low + high.  A final mpz_mod finishes the reduction
   modulo n.
   Uses the global y as scratch; the low part is saved in y before a is
   written, so a and b may be the same variable. */
void mod2minus(a,b,n) mpz_t a,b,n; /* N = 2^k - 1, ispower2<0 */
{
    unsigned long int shift = -ispower2;

    mpz_tdiv_r_2exp(y,b,shift);   /* y = low part  */
    mpz_tdiv_q_2exp(a,b,shift);   /* a = high part */
    mpz_add(a,y,a);               /* 2^k = 1 */
    mpz_mod(a,a,n);
}

#ifdef MODMULN
/* Adds the single limb incr to the multi-limb number starting at p and
   propagates the carry upwards for as long as limbs wrap around to zero.
   No length is passed: the caller must guarantee that the carry is
   absorbed inside the limbs it owns. */
static /* inline: not ANSI */ void
mpn_incr (mp_ptr p, mp_limb_t incr)
{
  mp_limb_t sum = p[0] + incr;

  p[0] = sum;
  if (sum < incr)
    {
      /* the addition wrapped around: add one to the following limbs
         until one of them does not become zero */
      do
        {
          p++;
          *p += 1;
        }
      while (*p == 0);
    }
}

/* Computes c/R^nn mod n, where n are nn limbs
   and c has space for size(c)+1 limbs.  n must be odd.
   The reduction is done in place on the limbs of c: the parameter a is
   not referenced by the body.  nn is taken from the global sizen and the
   per-limb multiplier from the global Nprim.
   NOTE(review): Nprim is presumably -1/n mod R (R = limb base), which makes
   limb j vanish in the loop below -- confirm where Nprim is computed.
*/
void mpz_mod_n (c, a, n) mpz_t c,a,n;
{
  mp_ptr cp=PTR(c), np=PTR(n);
  mp_limb_t cy;
  mp_limb_t q;
  size_t j,nn=sizen;

#ifdef DEBUG
if (ALLOC(c)<2*nn+1) { printf("ALLOC(c)<2*nn+1\n"); exit(1); }
if (mpz_size(c)>2*nn) { printf("mpz_size(c)>2*nn\n"); exit(1); }
#endif
  /* zero-extend c to 2*nn+1 limbs */
  for (j=ABS(SIZ(c));j<=2*nn;j++) cp[j] = 0;
  for (j = 0; j < nn; j++)
    {
      /* add q*n at limb position j and push the carry into the upper limbs */
      q = cp[j] * Nprim;
      cy = mpn_addmul_1 (cp + j, np, nn, q);
      mpn_incr (cp + nn + j, cy);
    }
  if (cp[2*nn] != 0) {
    /* overflow into limb 2*nn: subtract n from the high half, writing the
       result to the low half, until the overflow is cancelled */
    cy = cp[2*nn] - mpn_sub_n (cp, cp + nn, np, nn);
    while (cy) cy -= mpn_sub_n (cp, cp, np, nn);
  }
  else MPN_COPY (cp, cp + nn, nn); /* the result is the high half */
  /* presumably strips leading zero limbs and lowers nn accordingly --
     GMP-internal macro, TODO confirm */
  MPN_NORMALIZE (cp, nn);
  SIZ(c) = SIZ(c) < 0 ? -nn : nn; /* keep the sign of c */
}
#endif

/* Puts q*(x,y) in (x1,y1) by left-to-right binary multiplication, using
   duplicateW and addW.
   - p: receives the factor when an inversion fails, scratch otherwise
   - x1,y1: result; (x1,y1) may be identical to (x,y), in which case the
     work is done in temporaries and copied back on success only
   - x,y: point to multiply
   - q: multiplier, q>=1 (q=0 would be the point at infinity, which has no
     representation here: it is reported as an error and the program exits)
   - n: modulus, a: curve coefficient passed to duplicateW
   - u,v: auxiliary variables
   Returns 0 on success and non-zero when a factor was put in p; in that
   case (x1,y1) is unspecified and, when aliased, (x,y) is left untouched.
   The temporaries are released on every exit path.
*/
int multiplyW(p,x1,y1,x,y,q,n,a,u,v) mpz_t p,x1,y1,x,y,n,a,u,v; unsigned int q;
{
  unsigned int j,r,restore; int res; mpz_t x2,y2;

  if (q==0) { printf("Error: multiplyW called with q=0\n"); exit(1); }
  if (q==1) { /* 1*P = P: the doubling below must not be applied */
    mpz_set(x1,x); mpz_set(y1,y);
    return(0);
  }
  restore=(x1==x);
  if (restore) { mpz_init(x2); mpz_init(y2); x1=x2; y1=y2; }
  /* j <- highest power of two not exceeding q, then step to the next bit */
  for (r=q,j=1;r!=1;r/=2,j<<=1);
  j >>= 1;
  /* the leading bit gives P; the first step reads (x,y) directly */
  res=duplicateW(p,x1,y1,x,y,n,a,u,v);
  if (res==0 && (q&j)) res=addW(p,x1,y1,x1,y1,x,y,n,u,v);
  for (j>>=1; res==0 && j!=0; j>>=1) {
    res=duplicateW(p,x1,y1,x1,y1,n,a,u,v);
    if (res==0 && (q&j)) res=addW(p,x1,y1,x1,y1,x,y,n,u,v);
  }
  if (restore) {
    if (res==0) { mpz_set(x,x1); mpz_set(y,y1); }
    mpz_clear(x2); mpz_clear(y2);
  }
  return(res);
}

/* Doubles the point (x,y) modulo n and puts the result in (x1,y1), using
   the tangent slope (3*x^2+a)/(2*y).
   (x,y) can be identical to (x1,y1).
   u and v are auxiliary variables; p is used as scratch for the slope.
   Returns 1 when 2*y is not invertible mod n, leaving gcd(2*y,n) in p and
   (x1,y1) untouched; returns 0 otherwise. */
int duplicateW(p,x1,y1,x,y,n,a,u,v) mpz_t p,x1,y1,x,y,n,a,u,v;
{
  /* v = 1/(2y) mod n when the gcd is 1 */
  mpz_mul_ui(u,y,2);
  mpz_gcdext(p,v,NULL,u,n);
  if (mpz_cmp_ui(p,1)==0) {
    /* p = slope = (3x^2+a)/(2y) */
    mpz_mul(u,x,x);
    mpz_mul_ui(u,u,3);
    mpz_add(u,u,a);
    mod(u,u,n);
    mpz_mul(p,u,v);
    mod(p,p,n);
    /* u = slope^2 - 2x, the new x-coordinate */
    mpz_mul(u,p,p);
    mpz_mul_ui(v,x,2);
    mpz_sub(u,u,v);
    mod(u,u,n);
    /* y1 = slope*(x - u) - y */
    mpz_sub(v,x,u);
    mpz_mul(v,v,p);
    mpz_sub(y1,v,y);
    mod(y1,y1,n);
    /* x is no longer needed: x1 can be written even if it is x */
    mpz_set(x1,u);
    mul+=4;
    gcdexts++;
    return(0);
  }
  return(1);
}

/* performs the following loop with only one gcdext, using Montgomery's trick:
   for (j=0;j<e;j++) {
       res=addW(p,x[j],y[j],x[j],y[j],x[j+1],y[j+1],n,u[0],v[0]);
       if (res) return(1); }
   return(0);

   - p: receives the gcd when the combined inversion fails (return value 1);
        used as scratch for the slopes otherwise
   - x,y: arrays of e+1 coordinates; entries 0..e-1 are updated, entry e
        is only read
   - n: modulus
   - u,v: scratch arrays; u[0..e-1] and v[0..e] are used
   - e: number of additions to perform

   Uses one inversion and 6*e-3 multiplications as counted in mul
   (3 muls for e=1)
*/
int addWn(p,x,y,n,u,v,e) mpz_t p,*x,*y,n,*u,*v; int e;
{
  int j;
  /* u[j] = x[j+1]-x[j] are the e denominators; v[j] accumulates their
     products from the top down */
  mpz_sub(u[e-1],x[e],x[e-1]); mpz_set(v[e-1],u[e-1]);
  for (j=e-2;j>=0;j--) {
    mpz_sub(u[j],x[j+1],x[j]);
    mpz_mul(v[j],u[j],v[j+1]); /* v[j] = u[j]*u[j+1]*...*u[e-1] */
    mod(v[j],v[j],n);
    mul++;
  }
  /* single inversion of the product of all denominators */
  mpz_gcdext(p,v[e],NULL,v[0],n); if (mpz_cmp_ui(p,1)!=0) return(1);
  gcdexts++;
  for (j=0;j<e;j++) {
    /* loop invariant: v[e] = 1/(u[j]*u[j+1]*...*u[e-1]) */
    if (j!=e-1) {
      mpz_mul(v[j+1],v[j+1],v[e]); mod(v[j+1],v[j+1],n);
      /* restore v[e] for next loop and make u[j] free */
      mpz_mul(v[e],v[e],u[j]); mod(v[e],v[e],n); mul+=2; }
    /* now v[j+1] = 1/(x[j+1]-x[j]) mod n */
    /* p = slope; x[j+1] and y[j+1] still hold their original values here */
    mpz_sub(p,y[j+1],y[j]); mpz_mul(p,v[j+1],p); mod(p,p,n);
    mpz_mul(u[j],p,p); mpz_sub(u[j],u[j],x[j]); 
    mpz_sub(x[j],u[j],x[j+1]); mod(x[j],x[j],n); /* x[j] = slope^2 - x[j] - x[j+1] */
    mpz_sub(u[j],x[j+1],x[j]); mpz_mul(u[j],u[j],p); 
    mpz_sub(y[j],u[j],y[j+1]); mod(y[j],y[j],n); /* y[j] = slope*(x[j+1]-x[j]) - y[j+1] */
    mul+=3;
  }
  return(0);
}

/* Adds the points (x1,y1) and (x2,y2) modulo n and puts the sum in (x,y),
   using the chord slope (y2-y1)/(x2-x1).
   u and v are auxiliary variables; p is used as scratch for the slope.
   (x,y) is written only after the last read of x1 and y1.
   Returns 1 when x2-x1 is not invertible mod n, leaving gcd(x2-x1,n) in p
   and (x,y) untouched; returns 0 otherwise. */
int addW(p,x,y,x1,y1,x2,y2,n,u,v) mpz_t p,x,y,x1,y1,x2,y2,n,u,v;
{
  /* v = 1/(x2-x1) mod n when the gcd is 1 */
  mpz_sub(u,x2,x1);
  mpz_gcdext(p,v,NULL,u,n);
  if (mpz_cmp_ui(p,1)!=0) {
    return(1);
  }
  /* p = slope = (y2-y1)/(x2-x1) */
  mpz_sub(p,y2,y1);
  mpz_mul(p,v,p);
  mod(p,p,n);
  /* v = slope^2 - x1 - x2, the new x-coordinate */
  mpz_mul(u,p,p);
  mpz_sub(u,u,x1);
  mpz_sub(v,u,x2);
  mod(v,v,n);
  /* y = slope*(x1 - v) - y1 */
  mpz_sub(u,x1,v);
  mpz_mul(u,u,p);
  mpz_sub(y,u,y1);
  mod(y,y,n);
  mpz_set(x,v);
  mul+=3;
  gcdexts++;
  return(0);
}

/* Subtracts the point (x2,y2) from (x1,y1) modulo n and puts the result in
   (x,y): same formulas as addW applied to (x2,-y2), i.e. with slope
   (y1+y2)/(x1-x2).
   (x,y) can be identical to (x1,y1).
   u and v are auxiliary variables; p is used as scratch for the slope.
   Returns 1 when x1-x2 is not invertible mod n, leaving gcd(x1-x2,n) in p
   and (x,y) untouched; returns 0 otherwise. */
int subW(p,x,y,x1,y1,x2,y2,n,u,v) mpz_t p,x,y,x1,y1,x2,y2,n,u,v;
{
  /* v = 1/(x1-x2) mod n when the gcd is 1 */
  mpz_sub(u,x1,x2);
  mpz_gcdext(p,v,NULL,u,n);
  if (mpz_cmp_ui(p,1)!=0) {
    return(1);
  }
  /* p = slope = (y1+y2)/(x1-x2) */
  mpz_add(p,y1,y2);
  mpz_mul(p,v,p);
  mod(p,p,n);
  /* v = slope^2 - x1 - x2, the new x-coordinate */
  mpz_mul(u,p,p);
  mpz_sub(u,u,x1);
  mpz_sub(v,u,x2);
  mod(v,v,n);
  /* y = slope*(x1 - v) - y1 */
  mpz_sub(u,x1,v);
  mpz_mul(u,u,p);
  mpz_sub(y,u,y1);
  mod(y,y,n);
  mpz_set(x,v);
  mul+=3;
  gcdexts++;
  return(0);
}

/* Builds the product tree over the n values found in F[0][0..n-1], for n a
   power of two.  Level d+1 is obtained by multiplying, with polymul, the
   adjacent pairs of monic polynomials of level d; each entry of F[0] acts
   as the constant coefficient of a monic linear factor.  F[d] thus holds
   the products of 2^d consecutive factors for d=1..lg(n), leading
   coefficients being implicit.  Finally recip of the top level is put in
   F[lg(n)+1].
   Every level from 1 to lg(n)+1 is allocated here with n+1 initialised
   cells (the malloc results are not checked); F must have room for
   lg(n)+2 pointers.  T is scratch space handed to polymul and recip.
   Timings are printed when verbose is set. */
void buildF(F,T,n) mpz_t **F,*T; unsigned int n;
{
   unsigned int level,width,t0,t1; int i;

   t0=cputime();
   for (level=0,width=1; width<n; level++,width*=2) {
      /* allocate the next level, then fill it pair by pair */
      F[level+1]=(mpz_t*) malloc((n+1)*sizeof(mpz_t));
      for (i=0;i<=n;i++)
         mpz_init(F[level+1][i]);
      t1=cputime();
      for (i=0;i<n;i+=2*width)
         polymul(F[level+1]+i,F[level]+i,F[level]+i+width,width,T);
      if (verbose && width==n/2)
         printf("Product of two polynomials of degree %d took %dms\n",width,cputime()-t1);
   }
   if (verbose) {
      printf("Building f from its roots took %dms\n",cputime()-t0);
      fflush(stdout);
   }

   /* one more level for the reciprocal of the complete product */
   F[level+1] = (mpz_t*) malloc((n+1)*sizeof(mpz_t));
   for (i=0;i<=n;i++)
      mpz_init(F[level+1][i]);
   t0=cputime();
   recip(F[level+1], F[level], T, n);
   if (verbose)
      printf("Inverting f took %dms\n",cputime()-t0);
}

/* Replaces the n values G[0..n-1] (n a power of two) by coefficients of
   the product of the n monic linear factors having these values as
   constant coefficients, as computed by repeated calls to polymul.
   The levels of the product tree alternate between G and T[0..n-1], and
   T[n..] is the scratch area given to polymul, hence 2*n cells of T.
   At the end the constant coefficient is dropped, the others move down by
   one place and the leading 1 is stored in G[n-1]: the product is divided
   by x, which is exact only when one of the input values is zero (step2
   sets the last one to zero before calling).
   A timing is printed when verbose is set. */
void buildG(G,T,n) mpz_t *G,*T; unsigned int n;
{
   unsigned int st,D; int i;
   mpz_t *src,*dst,*swap;

   st=cputime();
   src=G; dst=T;
   for (D=1; D<n; D*=2) {
     /* multiply adjacent blocks of size D from src into dst */
     for (i=0;i<n;i+=2*D)
       polymul(dst+i,src+i,src+i+D,D,T+n);
     swap=src; src=dst; dst=swap;
   }
   /* after an odd number of levels the product sits in T */
   if (src!=G) {
     for (i=0;i<n;i++) mpz_set(G[i],T[i]);
   }
   /* divide by x and make the leading coefficient explicit */
   for (i=0;i<n-1;i++)
     mpz_set(G[i], G[i+1]);
   mpz_set_ui(G[n-1], 1);
   if (verbose)
     printf("Building g from its roots took %dms\n",cputime()-st);
}

/* algorithm POLYEVAL from section 3.7 of Peter Montgomery's dissertation.
Input: 
n - a power of two
G - a table (or array) of elements of R, G[i], 0<=i<n-1
a - a table (or array) of elements of R, a[i], 0<=i<n
    (there is no parameter a: the a[i] enter through the table F, which is
    presumably the one built by buildF from them -- confirm at the caller)
F - product tree; F[lg(n)+1] must hold the reciprocal of F[lg(n)].
    Levels 1..lg(n) are overwritten during the evaluation, so the tree
    cannot be reused afterwards.
T - scratch space; cells up to T+5*D are handed to polymul1 with D<=n/2
Output: the sequence of values of G(a[i]) are stored in G[i] for 0<=i<n
A timing is printed when verbose is set.
*/
void polyeval(G,F,T,n) mpz_t *G,**F,*T; unsigned int n;
{
   mpz_t *q; unsigned int d,D,st,j; int i;

   st=cputime();
   /* walk down the tree: D is the size of the blocks at level d */
   D = n/2; d=lg(n)-1; while (D>=1) {
     /* blocks of size 2*D, from the highest index down to i=0 */
     for (i=n-2*D;i>=0;i-=2*D) {
       q = F[d+2]+D+i; /* recip(F[d+1]+i)/X^D */
       polymul(F[d+1]+i, F[d]+i, q, D, T);
       /* highest part is already in F[d+1]+D+i */
       if (i) polymul(F[d+1]+i-D, F[d]+D+i, q, D, T);
       else { /* special case for i=0 */
	 polymul(T, F[d]+D, q, D, T+2*D);
	 for (j=0;j<D;j++) mpz_set(F[d+1][j], T[D+j]);
       }
       q = G+D+i; /* upper half of the current block of G */
       polymul1(T+D, F[d+1]+i, q, D, T+3*D); /* h = T+2*D */
       polymul1(T, F[d]+i, T+2*D, D, T+4*D); /* T = h * F[d]+i */
       for (j=0;j<D;j++) mpz_sub(T[j], G[i+j], T[j]);
       polymul1(T+2*D, F[d+1]+D+i, q, D, T+4*D); /* h = T+3*D */
       polymul1(T+D, F[d]+D+i, T+3*D, D, T+5*D); /* T+D = h * F[d]+D+i */
       /* write back the two halves: upper one first, as it reads G[i+j] */
       for (j=0;j<D;j++) mpz_sub(G[D+i+j], G[i+j], T[D+j]);
       for (j=0;j<D;j++) mpz_set(G[i+j], T[j]);
     }
     D /= 2; d--;
   } 
   if (verbose) printf("Evaluating g on roots of f took %dms\n",cputime()-st);
}

/* multiplies b[0]+b[1]*x+...+b[K-1]*x^(K-1) by c[0]+c[1]*x+...+c[K-1]*x^(K-1)
     and puts the result in a[0]+a[1]*x+...+a[2*K-2]*x^(2K-2)
     where K is a power of two. 
     t is an auxiliary storage: the cells t[0..2*K-2] are used (the sums
     b0+b1 and c0+c1 go to t[0..K-1], their product to t[K..2K-2]).
     Only the products of single coefficients (K=1) are reduced mod n, the
     global modulus; the additions and subtractions are left unreduced.
     Each such product is counted in mul.
*/
void karatsuba(a,b,c,K,t) mpz_t *a,*b,*c,*t; unsigned int K;
{
   if (K==1) { mpz_mul(a[0],b[0],c[0]); mod(a[0],a[0],n); mul++; }
   else { int i,k=K/2;
      /* t[0..k-1] = b0+b1 and t[k..K-1] = c0+c1, b0/b1 being the low/high half of b */
      for (i=0;i<k;i++) { 
         mpz_add(t[i],b[i],b[k+i]); mpz_add(t[k+i],c[i],c[k+i]); 
       }
      /* a is still free at this point and serves as scratch for this call */
      karatsuba(t+K,t,t+k,k,a); /* puts (b0+b1)*(c0+c1) in t[K..2K-2] */
      karatsuba(a,b,c,k,t); /* puts b0*c0 in a[0..K-2] */
      mpz_set_ui(a[K-1],0);
      karatsuba(a+K,b+k,c+k,k,t); /* puts b1*c1 in a[K..2K-2] */
      /* a[K-1] = a[2K-1] = t[2K-1] = 0 */
      /* middle term: (b0+b1)*(c0+c1) - b0*c0 - b1*c1 */
      for (i=0;i<K-1;i++) {
         mpz_sub(t[K+i],t[K+i],a[i]); mpz_sub(t[K+i],t[K+i],a[K+i]);
      }
      /* add the middle term at degree k */
      for (i=0;i<K-1;i++) mpz_add(a[k+i],a[k+i],t[K+i]);
   }
}

/* same as karatsuba(a,b,b,K,t) but should be faster:
   squares b[0]+...+b[K-1]*x^(K-1) and puts the result in a[0..2K-2],
   K being a power of two.
   t is an auxiliary storage: t[0..K/2-1] receives b0+b1 and t[K..2K-2]
   its square, so the cells up to t[2*K-2] are used.
   Only the squares of single coefficients (K=1) are reduced mod the global
   n; each one is counted in mul. */
void karasqr(a,b,K,t) mpz_t *a,*b,*t; unsigned int K;
{
   if (K==1) { mpz_mul(a[0],b[0],b[0]); mod(a[0],a[0],n); mul++; }
   else { int i,k=K/2;
      /* t[0..k-1] = b0+b1, b0/b1 being the low/high half of b */
      for (i=0;i<k;i++) mpz_add(t[i],b[i],b[k+i]); 
      /* a is still free at this point and serves as scratch for this call */
      karasqr(t+K,t,k,a); /* puts (b0+b1)^2 in t[K..2K-2] */
      karasqr(a,b,k,t); /* puts b0^2 in a[0..K-2] */
      mpz_set_ui(a[K-1],0);
      karasqr(a+K,b+k,k,t); /* puts b1^2 in a[K..2K-2] */
      /* a[K-1] = a[2K-1] = t[2K-1] = 0 */
      /* middle term: (b0+b1)^2 - b0^2 - b1^2 = 2*b0*b1 */
      for (i=0;i<K-1;i++) {
         mpz_sub(t[K+i],t[K+i],a[i]); mpz_sub(t[K+i],t[K+i],a[K+i]);
      }
      /* add the middle term at degree k */
      for (i=0;i<K-1;i++) mpz_add(a[k+i],a[k+i],t[K+i]);
   }
}

/* Product of two monic polynomials of degree k whose leading coefficients
   are implicit: B = b[0]+...+b[k-1]*x^(k-1)+x^k and
   C = c[0]+...+c[k-1]*x^(k-1)+x^k.
   The coefficients of degree 0..2k-1 of B*C are put in a[0..2k-1]; the
   leading x^(2k) stays implicit.  karatsuba multiplies the stored parts,
   then x^k*(b+c) is added on top.  t is the scratch area for karatsuba. */
void polymul(a,b,c,k,t) mpz_t *a,*b,*c,*t; unsigned int k;
{
  unsigned int j;

  /* a[0..2k-2] = product of the parts below x^k */
  karatsuba(a,b,c,k,t);
  /* add x^k*(b+c) where karatsuba already produced a coefficient */
  for (j=0;j<k-1;j++) {
    mpz_add(a[k+j],a[k+j],b[j]);
    mpz_add(a[k+j],a[k+j],c[j]);
  }
  /* degree 2k-1 gets no contribution from karatsuba */
  mpz_add(a[2*k-1],b[k-1],c[k-1]);
}

/* Product of the monic polynomial B = b[0]+...+b[k-1]*x^(k-1)+x^k, whose
   leading coefficient is implicit, by C = c[0]+...+c[k-1]*x^(k-1).
   The 2k coefficients of B*C are put in a[0..2k-1].  karatsuba multiplies
   the stored parts, then x^k*c is added on top.  t is the scratch area for
   karatsuba. */
void polymul1(a,b,c,k,t) mpz_t *a,*b,*c,*t; unsigned int k;
{
  unsigned int j;

  /* a[0..2k-2] = product of the parts below x^k */
  karatsuba(a,b,c,k,t);
  /* add x^k*c where karatsuba already produced a coefficient */
  for (j=0;j<k-1;j++)
    mpz_add(a[k+j],a[k+j],c[j]);
  /* degree 2k-1 comes from x^k*c alone */
  mpz_set(a[2*k-1],c[k-1]);
}

/* Schoolbook O(K^2) product of b[0]+...+b[K-1]*x^(K-1) by
   c[0]+...+c[K-1]*x^(K-1); the 2K-1 coefficients go to a[0..2K-2].
   Each coefficient is accumulated without reduction and reduced once
   modulo the global n.  Only t[0] is used as scratch.  mul is increased
   by K*K. */
void naivemul(a,b,c,K,t) mpz_t *a,*b,*c,*t; unsigned int K;
{
  int deg,s;

  /* degrees 0..K-1: a[deg] = b[deg]*c[0] + ... + b[0]*c[deg] */
  for (deg=0;deg<K;deg++) {
    mpz_mul(a[deg],b[deg],c[0]);
    for (s=1;s<=deg;s++) {
      mpz_mul(t[0],b[deg-s],c[s]);
      mpz_add(a[deg],a[deg],t[0]);
    }
    mod(a[deg],a[deg],n); /* one reduction per coefficient */
  }
  /* degrees K..2K-2: a[deg] = b[K-1]*c[deg-K+1] + ... + b[deg-K+1]*c[K-1] */
  for (deg=K;deg<2*K-1;deg++) {
    mpz_mul(a[deg],b[K-1],c[deg-K+1]);
    for (s=deg-K+2;s<K;s++) {
      mpz_mul(t[0],b[deg-s],c[s]);
      mpz_add(a[deg],a[deg],t[0]);
    }
    mod(a[deg],a[deg],n); /* one reduction per coefficient */
  }
  mul += K*K;
}

/* Evaluates the monic polynomial f(x)=f[0]+f[1]*x+...+f[k-1]*x^(k-1)+x^k
   at x by Horner's rule and puts the value, reduced modulo the global n,
   in p.  The leading coefficient 1 is implicit: p starts at 1.
   mul is increased by k. */
void horner(p,f,x,k) mpz_t p,*f,x; unsigned int k;
{
  int i=k;

  mpz_set_ui(p,1);
  /* from the coefficient of x^(k-1) down to the constant one */
  while (i-- > 0) {
    mpz_mul(p,p,x);
    mpz_add(p,p,f[i]);
    mod(p,p,n);
  }
  mul += k;
}

/* Prints the polynomial f[0]+f[1]*x+...+f[k-1]*x^(k-1) in base 10 on
   stdout, followed by a newline.
   - f: coefficients, lowest degree first
   - k: number of coefficients
   - flag: when non-zero, a trailing monic term +x^k is printed as well
   A '+' is written before every non-negative coefficient except the first;
   negative coefficients bring their own sign.
   The index and the degree are unsigned and printed with %u, which matches
   their type (they were printed with %d before). */
void printpol(f,k,flag) mpz_t *f; unsigned int k,flag;
{
  unsigned int i;
  for (i=0;i<k;i++) {
    if (i>0 && mpz_cmp_ui(f[i],0)>=0) printf("+");
    mpz_out_str(stdout,10,f[i]);
    if (i>0) { printf("*x"); if (i>1) printf("^%u",i); }
  }
  if (flag) printf("+x^%u",k);
  printf("\n");
}

/* compute RECIP(F) = divide(X^(2*m),F,Quo) where m=deg(F) is a power of 2
>> F:=poly(x^4+x^3+x^2+1): recip(F);

                                  3    4
                        poly(x - x  + x  - 2, [x])

The result is put in R, T is an auxiliary storage of size 5*m.
Here fn=lc(F)=1.

Parameters:
- R: result; cells R[0..m] are written
- F: coefficients F[0..m-1] of the monic polynomial; F[m] is set to 1 here,
     so F must have m+1 cells
- T: scratch space
- m: degree of F
The reciprocal is doubled in length at each iteration, from the top
coefficient downwards.

Cost is 3*(M(m/2)+M(m/4)+...), i.e. 9/2*M(m/2) for Karatsuba.
Assumes coeff(F,0)=0, i.e. coeff(R,0) is not computed.
*/
void recip(R,F,T,m) mpz_t *R,*F,*T; unsigned int m;
{
   unsigned int k,j,hi;

   hi=m; /* R_k of degree k-1 is stored in R[hi]..R[hi+k-1] */
   mpz_set_ui(R[hi],1); /* R_1(X)=1 */
   mpz_set_ui(F[m], 1); /* make the leading coefficient of F explicit */
   for (k=2;k<=m;k*=2) {
      /* stores head(F,2*k)*R_{k/2} in T[0..2*k-1] */
      karatsuba(T, F+m-k+1, R+hi, k/2, T+k);
      mpz_set_ui(T[k-1], 0);
      karatsuba(T+k, F+m-k/2+1, R+hi, k/2, T+2*k);
      /* only the coefficients of degree k/2-1 to k-2 are needed */
      mpz_set(T[0], T[k/2-1]); /* coeff of degree k/2-1 */
      for(j=k/2;j<k-1;j++) mpz_add(T[1+j-k/2], T[j], T[k/2+j]);
      /* now multiplies by R_{k/2} */
      karatsuba(T+2*k, T, R+hi, k/2, T+3*k);
      /* the upper coefficients from k/2 to k-1 of R are unchanged */
      /* the lower coefficients from 0 to k/2-1 come directly from h */
      hi -= k/2;
      for (j=0;j<k/2;j++) mpz_neg(R[hi+j],T[2*k+k/2+j-1]);
   }
   mpz_set_ui(R[0],0); /* coeff(R,0) is not computed, see above */
}
