Location>code7788 >text

String Learning Notes

Popularity:761 ℃/2024-08-21 18:22:41

Extended kmp

Let z[i] represent the longest common prefix between the string after i and the original string

r is the current maximal position of get and l is the corresponding left endpoint

It's an obvious state transfer.

For example, now that we've enumerated to the position i

i in the range [l,r], first S[l,r]==S[1,r-l+1]

Thus S[i,r]==S[i-l+1,r-l+1]

Then obviously z[i]=min(z[i-l+1],r-i+1) can't exceed length

Suppose z[i-l+1]+(i-l+1)-1=r-l+1, that is, z[i]+i-1=r

But a match at r+1 is still possible because Sr+1!=Sr-l+2 but not necessarily not equal to the prefix Sz[i]+1

Then just do it violently, since z[i]++ must make r bigger.

So it is increasing while loop is executed at most n times (execution of while loop assumes z[i]+i-1=r)

Note that 1 can't go into a loop, otherwise it's always z[i]=z[i] and every i executes a while loop, rendering the algorithm invalid.

void getz()
{
    z[1]=n;
    int l=0,r=0;
    for(int i=2;i<=n;i++){
        if(i<=r)z[i]=min(z[i-l+1],r-i+1);
        while(i+z[i]<=n&&b[i+z[i]]==b[z[i]+1])z[i]++;
        if(i+z[i]-1>r)r=i+z[i]-1,l=i;
    }
}

AC automatic machine

fail[i] represents the mismatch pointer of the node i, i.e., the longest common suffix (which cannot be itself)

First build the trie tree, then bfs build the automaton, note that in order to prevent themselves from matching themselves, the first level push into the queue can be

void insert(int i){
	string x;
	cin>>x;
	s[i]=x;
	int now=0;
	for(auto j:x){
		if(!tr[now][j-'a'])
			tr[now][j-'a']=++cnt;
		now=tr[now][j-'a'];
	}
	mp[i]=now;
}
void build(){
	queue<int>q;
	for(int i=0;i<26;i++)if(tr[0][i])(tr[0][i]);
	while(!()){
		int now=();();
		for(int i=0;i<26;i++){
			if(tr[now][i]){
				fail[tr[now][i]]=tr[fail[now]][i];
				(tr[now][i]);
			}
			else tr[now][i]=tr[fail[now]][i];
		}
	}
}

palindrome

Since the echo string ending in s[i] must be transferred from the echo string ending in s[i-1]

So build the palindrome tree

getfail(x,i) means to find the previous subscript in the palindrome with subscript x, and the previous subscript at the beginning is equal to s[i].

i.e. the longest palindrome suffix

Why is the dot inserted after the longest palindrome suffix :

Because if it's not the longest palindrome suffix, then it must have appeared. The nature of palindromes in palindromes.

0 for even numbered returns 1 for odd numbered returns

len[1]=-1 --> guarantees correct +=2 result every time

Fail pointer solving:

Since the longest echo suffix can't be itself so apply fail[pos] to solve for it instead of pos (since it would return pos directly resulting in return itself) and transfer it directly based on the getfail function

Time Complexity Proof:

Since each insertion makes the depth +1, the depth is added at most n times, the while loop is executed at most n times

ps: the second one is a bit problematic, but just use it, no one is proving it!

int getfail(int x,int i){
    while(s[i-len[x]-1]!=s[i])x=fail[x];
    return x;
}
void solve()
{
    string s;
    cin>>s;
    int n=(),pre=0;
    s=" "+s;
    len[1]=-1,fail[0]=1;
    for(int i=1;i<=n;i++){
        int pos=getfail(pre,i);
        if(!tr[pos][s[i]-'a']){
            fail[++tot]=tr[getfail(fail[pos],i)][s[i]-'a'];
            tr[pos][s[i]-'a']=tot;
            len[tot]=len[pos]+2;
            cnt[tot]=cnt[fail[tot]]+1;
        }
        pre=tr[pos][s[i]-'a'];
    }
}

half[i] is the longest i-terminated palindrome suffix with length less than or equal to len[i]/2.

for(int i=1;i<=n;i++){
    int pos=getfail(pre,i);
    if(!tr[pos][s[i]-'a']){
        fail[++tot]=tr[getfail(fail[pos],i)][s[i]-'a'];
        tr[pos][s[i]-'a']=tot;
        len[tot]=len[pos]+2;
        if(len[tot]<=2)half[tot]=fail[tot];
        else {
        	int tmp=half[pos];
            while(s[i-1-len[tmp]]!=s[i]||2*len[tmp]+4>len[tot]){
                tmp=fail[tmp];
            }
            half[tot]=tr[tmp][s[i]-'a'];
        }
    }
    pre=tr[pos][s[i]-'a'];
}

suffix array

See code for specific details

void get_sa(){
    int m=130;//m is the max number of the bucket
    vector<int>sa(n+1,0),sa2(2*n+1,0),rk(2*n+1,0),c(max(n,m)+1,0);
    /*
        sa[i] represents the starting number of the suffix ranked i
        sa2[i] represents the starting number of the suffix ranked i after the second keyword sorting
        rk[i] represents the ranking of suffix with i as the starting suffix
        c[i] is the number of buckets to store the first keyword.
    */
    for(int i=1;i<=n;i++)c[rk[i]=s[i]]++;
    // At the beginning the first keyword is s[i] just deposit it into the bucket
    for(int i=1;i<=m;i++)c[i]+=c[i-1]; //do the prefix sum.
    //Do a prefix sum so that you can find the maximum rank for each first keyword.
    for(int i=n;i>=1;i--)sa[c[rk[i]]--]=i;
    // Sort by first keyword
    for(int k=1;k<=n;k<<=1){
        int num=0;
        for(int i=n-k+1;i<=n;i++)sa2[++num]=i;
        // Since there is no second keyword for n-k+1~n it is ranked the highest
        for(int i=1;i<=n;i++)if(sa[i]>k)sa2[++num]=sa[i]-k;
        //Iterate through sa[i] from smallest to largest to ensure that the ranking is from front to back.
        //If sa[i]>k means this can be used as the second keyword, then it is stored in sa2.
        for(int i=1;i<=m;i++)c[i]=0;
        //initialize bucket
        for(int i=1;i<=n;i++)c[rk[i]]++;
        for(int i=1;i<=m;i++)c[i]+=c[i-1];
        //Store the first keyword and do a prefix sum
        for(int i=n;i>=1;i--)sa[c[rk[sa2[i]]]--]=sa2[i];
        //Traversing backwards ensures that the sorting is from back to front
        //rk[sa2[i]] is the first keyword with the second keyword ranked as a suffix to i
        //it could be like above, since the bucket prefix and the first keyword are already sorted.
        //so it's essentially iterating backwards, where each first keyword doesn't matter, it's all in the same interval.
        // and then sort the second keyword backwards.
        swap(rk,sa2);//The next step is to update the rk array, so do a temporary swap.
        rk[sa[1]]=1;//obviously the rk of the suffix ranked 1 is 1
        num=1;
        for(int i=2;i<=n;i++){
            rk[sa[i]]=(sa2[sa[i]]==sa2[sa[i-1]]&&sa2[sa[i]+k]==sa2[sa[i-1]+k])?num:++num;
            //If the first and second keywords are both the same as the previous one, the rankings are the same and num does not need to be ++
        }
        if(num==n)break;
        //num==n means all the suffixes are not ranked the same, end of sorting
        m=num;//update the maximum value of the container.
    }
    for(int i=1;i<=n;i++)cout<<sa[i]<<' ';;
}

Solving for height arrays

void get_height(){
    //height[i] represents LCP(sa[i],sa[i-1])
    height[1]=0;
    int k=0;
    //k represents the length that is now matched to
    for(int i=1;i<=n;i++){
      //traversal by suffix It's easy to know that height[rk[i]]>=height[rk[i-1]]-1
        int j=sa[rk[i]-1];// starting subscript of the suffix of the previous ranking
        if(k)k--;//-1 because the first character lost
        while(s[i+k]==s[j+k])k++;//violent matching k max is n total time complexity max is O(n)
        height[rk[i]]=k;//k is the longest common prefix
    }
}

Suffix array template

struct SA{
    vector<int>sa,sa2,c,rk,height;
    vector<vector<int>>dp;
    void init(string s,int n){
        int m=150;
        sa=vector<int>(max(n,m)+1,0);
        c=height=sa;
        sa2=rk=vector<int>(n*2+1,0);
        //likelihoodrepoint of contact (math.)
        dp=vector<vector<int>>(n+1,vector<int>(31,1e9+10));
        for(int i=1;i<=n;i++)c[rk[i]=s[i]]++;
        for(int i=1;i<=m;i++)c[i]+=c[i-1];
        for(int i=n;i>=1;i--)sa[c[rk[i]]--]=i;
        for(int k=1;k<=n;k<<=1){
            int num=0;
            for(int i=n-k+1;i<=n;i++)sa2[++num]=i;
            for(int i=1;i<=n;i++)if(sa[i]>k)sa2[++num]=sa[i]-k;
            for(int i=1;i<=m;i++)c[i]=0;
            for(int i=1;i<=n;i++)c[rk[i]]++;
            for(int i=1;i<=m;i++)c[i]+=c[i-1];
            for(int i=n;i>=1;i--)sa[c[rk[sa2[i]]]--]=sa2[i];
            num=1;swap(rk,sa2);
            rk[sa[1]]=1;
            for(int i=2;i<=n;i++){
                rk[sa[i]]=(sa2[sa[i]]==sa2[sa[i-1]]&&sa2[sa[i]+k]==sa2[sa[i-1]+k])?num:++num;
            }
            m=num;
            if(m==n)break;
        }
        int k=0;
        for(int i=1;i<=n;i++){
            int j=sa[rk[i]-1];
            if(k)k--;
            while(max(i,j)+k<=n&&s[i+k]==s[j+k])k++;
            height[rk[i]]=k;
        }
        for(int i=1;i<=n;i++)dp[i][0]=height[i];
        for(int j=1;(1<<j)<=n;j++){
            for(int i=1;i+(1<<j)-1<=n;i++){
                dp[i][j]=min(dp[i][j-1],dp[i+(1<<j-1)][j-1]);
            }
        }
    }
    int lca(int l,int r){
        l=rk[l],r=rk[r];
        if(l>r)swap(l,r);
        l++;
        int len=lg[r-l+1];
        return min(dp[l][len],dp[r-(1<<len)+1][len]);
    }
};

Suffix Automata Template

struct sam{
    vector<vector<int>>tr;
    vector<int>len,fa;
    int tot=1,now=0,lst=1;
    void init(int n){
        tr=vector<vector<int>>(n*2+10,vector<int>(30,0));
        len=vector<int>(n*2+10,0);
        fa=len;
    }
    void insert(char c){
        c-='a';
        now=++tot;
        len[now]=len[lst]+1;
        while(!tr[lst][c]&&lst){
            tr[lst][c]=now;
            lst=fa[lst];
        }
        if(lst==0)fa[now]=1;
        else {
            int x=tr[lst][c];
            if(len[x]==len[lst]+1){
                fa[now]=x;
            }
            else {
                int y=++tot;
                tr[y]=tr[x];
                fa[y]=fa[x];
                len[y]=len[lst]+1;
                fa[x]=fa[now]=y;
                while(tr[lst][c]==x&&lst){
                    tr[lst][c]=y;
                    lst=fa[lst];
                }
            }
        }
        lst=now;
        cnt[now]=1;
    }
};