# 任意模数 NTT 原理

### 算法原理

\begin{aligned} F_i & =A(\omega_n^i)=\sum_{k=0}^{n-1}a_k\omega_n^{ki}\\ G_i & =B(\omega_n^i)=\sum_{k=0}^{n-1}b_k\omega_n^{ki} \end{aligned}

\begin{aligned} dft(A+iB) _ k & =\sum _ {i=0}^{n-1}(a_i+ib_i)\omega _ n^{ki}=A(\omega _ n^k)+iB(\omega _ n^k)\\ dft(A-iB) _ k & =\sum _ {i=0}^{n-1}(a_i-ib_i)\omega _ n^{ki}=A(\omega _ n^k)-iB(\omega _ n^k)=conj(dft(A+iB) _ {n-k}) \end{aligned}

\begin{aligned} dft(A)_k & =\frac{dft(A+iB)+dft(A-iB)}{2}\\ \ dft(B)_k & =\frac{dft(A+iB)-dft(A-iB)}{2i} \end{aligned}

\begin{aligned} dft(A+iB) _ {n-k} & =\sum _ {i=0}^{n-1}(a _ i+ib _ i)\omega _ n^{-ki}\\ & =\sum _ {i=0}^{n-1}(a _ i+ib _ i)(\cos \theta-i\sin \theta)\\ & =\sum _ {i=0}^{n-1}(a _ i\cos\theta+b _ i\sin\theta)-i(a _ i\sin\theta-b _ i\cos\theta)\\ & =\sum _ {i=0}^{n-1}conj((a _ i\cos\theta+b _ i\sin\theta)+i(a _ i\sin\theta-b _ i\cos\theta))\\ & =\sum _ {i=0}^{n-1}conj((a _ i-ib _ i)(cos\theta+i\sin\theta))\\ & =\sum _ {i=0}^{n-1}conj((a _ i-ib _ i)\omega _ n^{ki})\\ & =conj(dft(A-iB) _ k) \end{aligned}

### 算法实现

// luogu-judger-enable-o2
#include <bits/stdc++.h>
#define ML 262149
#define pi (acos(-1))
#define M 32768ll

using namespace std;

typedef long long ll;

typedef long double ldb;

{
x = 0; char c = getchar();
while(!isdigit(c)) c = getchar();
while(isdigit(c)) x = x*10+c-'0', c = getchar();
}

ll mod;

namespace fft
{
struct Z
{
ldb r, i;

Z (const ldb &r0 = 0, const ldb &i0 = 0) : r(r0), i(i0) {}
Z operator + (const Z& t) const {return Z(r+t.r, i+t.i);}
Z operator - (const Z& t) const {return Z(r-t.r, i-t.i);}
Z operator * (const Z& t) const {return Z(r*t.r-i*t.i, r*t.i+i*t.r);}
Z conj() const {return Z(r, -i);}
void operator /= (const ldb& t) {r /= t, i /= t;}
};

struct Fourier
{
int n, bit, rev[ML];

void init(int x)
{
n = 1, bit = 0;
while(n < x) n <<= 1, bit++;
for(int i=1; i<n; i++) rev[i] = (rev[i>>1]>>1) | ((i&1)<<(bit-1));
}

void dft(Z *x, int f)
{
for(int i=0; i<n; i++)
if(i < rev[i])
swap(x[i], x[rev[i]]);
for(int w=1; w<n; w<<=1)
{
for(int i=0; i<n; i+=(w<<1))
{
for(int j=0; j<w; j++)
{
Z a = x[i+j], b = x[i+j+w] * Z(cos(pi/w*j), f*sin(pi/w*j));;
x[i+j] = a + b;
x[i+j+w] = a - b;
}
}
}
if(f == -1) for(int i=0; i<n; i++) x[i] /= n;
}
} F;

Z Xq[ML], Yq[ML], xlyl[ML], xlyh[ML], xhyl[ML], xhyh[ML];

void fast_multiply(ll *x, ll *y, ll *ret)
{
for(int i=0; i<F.n; i++)
Xq[i] = Z(x[i]>>15, x[i]&((1<<15)-1)),
Yq[i] = Z(y[i]>>15, y[i]&((1<<15)-1));
F.dft(Xq, +1), F.dft(Yq, +1);
for(int i=0; i<F.n; i++)
{
int j = (F.n-i) & (F.n-1);
Z xh = (Xq[i]+Xq[j].conj()) * Z(0.5, 0);
Z xl = (Xq[i]-Xq[j].conj()) * Z(0, -0.5);
Z yh = (Yq[i]+Yq[j].conj()) * Z(0.5, 0);
Z yl = (Yq[i]-Yq[j].conj()) * Z(0, -0.5);
xhyh[j] = xh*yh, xhyl[j] = xh*yl, xlyh[j] = xl*yh, xlyl[j] = xl*yl;
}
for(int i=0; i<F.n; i++)
Xq[i] = xhyh[i] + xhyl[i] * Z(0, 1),
Yq[i] = xlyh[i] + xlyl[i] * Z(0, 1);
F.dft(Xq, +1), F.dft(Yq, +1);
for(int i=0; i<F.n; i++)
{
ll xhyh = ll(Xq[i].r/F.n + 0.5) % mod;
ll xhyl = ll(Xq[i].i/F.n + 0.5) % mod;
ll xlyh = ll(Yq[i].r/F.n + 0.5) % mod;
ll xlyl = ll(Yq[i].i/F.n + 0.5) % mod;
ret[i] = ((xhyh<<30) + (xhyl<<15) + (xlyh<<15) + (xlyl)) % mod;
}
}
}

ll a[ML], b[ML], c[ML], n, m;

int main()
{