[LeetCode] Repeated DNA Sequences hash map

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",

Return:
["AAAAACCCCC", "CCCCCAAAAA"].
Hide Tags
 Hash Table Bit Manipulation
 

  C++ 标准模板库不常用就容易忘,这个就是用hash map 做一个大表统计的,但是直接unordered_map<string, int > 这样会爆内存。
/// Hash-map solution: counts every 10-letter window of the input by its text.
///
/// Simple but memory-hungry: each distinct window is stored as a full string
/// key in the map.
class Solution {
public:
    /// Returns each 10-letter substring of `s` that occurs more than once.
    ///
    /// Every repeated sequence appears exactly once in the result, in the
    /// order in which its second occurrence is encountered. Inputs shorter
    /// than 10 characters yield an empty vector.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const std::size_t kWindow = 10;

        std::unordered_map<std::string, int> occurrences;
        std::vector<std::string> repeated;

        // `start + kWindow <= s.size()` keeps the window fully inside `s`
        // and is trivially false for short inputs.
        for (std::size_t start = 0; start + kWindow <= s.size(); ++start) {
            const std::string window = s.substr(start, kWindow);
            // Report only on the transition 1 -> 2 so that a sequence seen
            // three or more times is not added to the result again.
            if (++occurrences[window] == 2) {
                repeated.push_back(window);
            }
        }
        return repeated;
    }
};

  处理方法是将其改为 unordered_map<int, int>,通过 4 进制转换把每个 10 字符的子串编码成一个整数。另外还可以用 bitset 进一步降低内存。最后需要考虑重复问题:如果用 unordered_map,可以直接标记是否已经添加到返回的 vector 中;用 bitset 则可以先用临时变量 set<string> 存储,最后再生成返回的 vector。

#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
#include <bitset>
#include <set>
using namespace std;

//class Solution {
//public:
//    vector<string> findRepeatedDnaSequences(string s) {
//        unordered_map<string,int > mp;
//        int len = s.length(),curIdx = 0;
//        string curStr;
//        vector<string >ret;
//        while(curIdx + 10<=len){
//            curStr = s.substr(curIdx,10);
//            if(mp.find(curStr)!=mp.end()){
//                ret.push_back(curStr);
//            }
//            else
//                mp[curStr] = 1;
//            curIdx ++;
//        }
//        return ret;
//    }
//};

/// Bitset solution: encodes each 10-letter window as a 20-bit base-4 number
/// (2 bits per nucleotide) and records seen codes in a bitset instead of
/// storing the window text.
class Solution {
public:
    /// Returns each 10-letter substring of `s` that occurs more than once.
    ///
    /// Every repeated sequence appears exactly once, in lexicographic order
    /// (the results are collected in a std::set). Inputs shorter than 10
    /// characters yield an empty vector. Windows containing a character
    /// other than 'A', 'C', 'G' or 'T' are ignored.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        constexpr std::size_t kWindow = 10;
        // 4^10 possible windows; one bit per possible window code.
        constexpr std::size_t kCodeSpace = std::size_t{1} << (2 * kWindow);
        constexpr unsigned kMask = static_cast<unsigned>(kCodeSpace - 1);

        std::bitset<kCodeSpace> seen;        // default-constructed: all zero
        std::set<std::string> repeated;      // de-duplicates the results

        unsigned code = 0;   // rolling code of the most recent nucleotides
        std::size_t run = 0; // count of consecutive valid nucleotides

        for (std::size_t i = 0; i < s.size(); ++i) {
            const int digit = helpFun(s[i]);
            if (digit < 0) {
                // An invalid character cannot be part of any window:
                // restart the rolling code after it.
                run = 0;
                code = 0;
                continue;
            }

            // Shift in the new nucleotide; the mask drops the one that just
            // left the 10-letter window.
            code = ((code << 2) | static_cast<unsigned>(digit)) & kMask;

            if (++run < kWindow) {
                continue; // window not yet full
            }

            if (seen[code]) {
                repeated.insert(s.substr(i + 1 - kWindow, kWindow));
            } else {
                seen.set(code);
            }
        }
        return std::vector<std::string>(repeated.begin(), repeated.end());
    }

    /// Maps a nucleotide letter to its base-4 digit: A=0, C=1, G=2, T=3.
    /// Returns -1 for any other character.
    int helpFun(char c)
    {
        switch (c) {
            case 'A':   return 0;
            case 'C':   return 1;
            case 'G':   return 2;
            case 'T':   return 3;
            default:    return -1;
        }
    }
};

int main()
{
    string s= "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT";
    Solution sol;
    vector<string > ret = sol.findRepeatedDnaSequences(s);
    for(int i=0;i<ret.size();i++)
        cout<<ret[i]<<endl;
    return 0;
}