#include "reader.hpp"

#include "ids.hpp"
#include <cctype>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// TODO: next time, write a proper parser that supports lookahead, is driven by an explicit state variable, and automates the precedence rules.
/// Scans a character stream (presumably a MediaWiki XML export -- confirm
/// against the caller) and, for every `<id>` / `<text>` pair it finds,
/// records the ids of the `[[...]]` link targets seen inside the text.
///
/// Output:
///  - `outfile` (binary): per record, the current id, the number of distinct
///    resolved targets, then each target id, all written as raw host `int`s.
///  - `outfile_fr` (text): per record, "<id>\t<first resolved target>\n"
///    (0 when no target was resolved).
///
/// Link names that `idsconv::name_to_id` cannot resolve (negative return) are
/// counted per category or stored verbatim; see `write_inv_to_file`.
///
/// The parser does not own `rb` or `ids`; both must outlive this object.
/// `readbuffer::full()` is treated throughout as "more input is available",
/// which is how the scanning loops have always used it.
class parser_refs {
  public:

    /// @param rb          input stream to scan (not owned)
    /// @param ids         name -> id resolver (not owned)
    /// @param outfile     path of the binary link file
    /// @param outfile_fr  path of the "first reference" text file
    parser_refs(readbuffer *rb, idsconv *ids, const char* outfile, const char* outfile_fr = "/dev/null")
      : _fileb(rb), _idsconv(ids) {
      _outfile.open(outfile, std::ios_base::out | std::ios_base::binary);
      _outfile_fr.open(outfile_fr, std::ios_base::out | std::ios_base::binary);
      if (!_outfile.is_open()) {
        // Best effort only: parsing still runs, but nothing will be stored.
        std::cerr << "parser_refs: cannot open output file " << outfile << '\n';
      }
    }
    // No user-declared destructor: both std::ofstream members close themselves.

    /// Writes the unresolved-link report to `fh`: five "Missing:" counter
    /// lines followed by every uncategorised unresolved name, one per line.
    void write_inv_to_file(std::ofstream &fh) {
      fh << "Missing: " << _filenotfound << " File:" << '\n';
      fh << "Missing: " << _usernotfound << " User:" << '\n';
      fh << "Missing: " << _wiktnotfound << " interlink" << '\n';
      fh << "Missing: " << _spnotfound << " special" << '\n';
      fh << "Missing: " << _lgnotfound << " link to other language" << '\n';
      for (const std::string &name : _invalid) {
        fh << name << '\n';
      }
      fh.flush(); // one flush instead of one per line
    }

    /// Runs the scanner until the input is exhausted.
    ///
    /// States: find `<id...>` -> read the number -> find `<text...>` ->
    /// read the text body (collecting links) -> back to find `<id...>`.
    /// A record is written each time a text block ends (or is self-closing).
    /// A record whose text block is cut short by the end of input is not
    /// written.
    void parse() {
      phase next = phase::find_id;
      while (next != phase::done) {
        switch (next) {
          case phase::find_id:   next = run_find_id();   break;
          case phase::read_id:   next = run_read_id();   break;
          case phase::find_text: next = run_find_text(); break;
          case phase::read_text: next = run_read_text(); break;
          case phase::done:      break;
        }
      }
    }

    /// Flushes the record being built: writes `_cid`, the number of collected
    /// targets and the targets themselves (raw host `int`s) to the binary
    /// file, and "<id>\t<first target>\n" to the text file, then resets the
    /// per-record state.
    void write_current() {
      _outfile.write(reinterpret_cast<const char*>(&_cid), sizeof(int));
      const int count = static_cast<int>(_crefs.size());
      _outfile.write(reinterpret_cast<const char*>(&count), sizeof(int));
      for (const int ref : _crefs) {
        _outfile.write(reinterpret_cast<const char*>(&ref), sizeof(int));
      }
      _crefs.clear();
      // write first ref
      _outfile_fr << _cid << "\t" << _firstref << "\n";
      _firstref = 0;
    }

    /// Case-insensitive prefix test.
    /// @param buff  NUL-terminated string to inspect
    /// @param str   NUL-terminated prefix; must already be lower case
    /// @return true when `buff`, lower-cased, begins with `str`
    bool start_with(char const *buff, char const* str) {
      for (size_t pos = 0; str[pos] != 0; ++pos) {
        if (buff[pos] == 0) return false; // buff shorter than the prefix
        const char lowered = static_cast<char>(std::tolower(static_cast<unsigned char>(buff[pos])));
        if (lowered != str[pos]) return false;
      }
      return true;
    }

    /// Call right after a '&' has been read. If the input continues with
    /// "lt;!--" (an escaped XML comment opener) the whole comment is skipped.
    ///
    /// @param buff  link-name buffer; when the '&' did NOT start a comment,
    ///              '&' and the characters that matched the opener so far are
    ///              appended to it so that no input is lost
    /// @return  - not a comment: the first character that broke the match;
    ///          - comment skipped: the character following "--&gt;";
    ///          - '<' as soon as a raw '<' is met inside the comment
    ///            (emergency exit: the caller should try to close the text);
    ///          - '\0' when the input ends inside the comment.
    ///
    /// Known limitation: a '&' that breaks the match is returned as-is, so a
    /// caller that wants to treat it as a possible comment start must call
    /// this function again.
    char check_for_xml_comment(std::vector<char> &buff) {
      static const char opener[] = "lt;!--";
      const int opener_len = 6;
      for (int matched = 0; matched < opener_len; ++matched) {
        const char c = _fileb->get();
        if (c != opener[matched]) {
          // Not a comment: give back everything consumed so far. The
          // consumed characters equal the matched part of `opener`.
          buff.emplace_back('&');
          buff.insert(buff.end(), opener, opener + matched);
          return c;
        }
      }

      // Inside the comment: look for "--" (or more dashes) followed by "&gt;".
      static const char closer[] = "gt;";
      int dashes = 0;           // consecutive '-' just seen, capped at 2
      bool has_pending = false; // a look-ahead character must be re-examined
      char pending = 0;
      for (;;) {
        char c;
        if (has_pending) {
          c = pending;
          has_pending = false;
        } else {
          if (!_fileb->full()) return '\0'; // unterminated comment at end of input
          c = _fileb->get();
        }
        if (c == '<') return '<'; // raw '<' cannot be text content: bail out
        if (c == '-') {
          if (dashes < 2) ++dashes;
          continue;
        }
        if (c == '&' && dashes >= 2) {
          int k = 0;
          while (k < 3) {
            pending = _fileb->get();
            if (pending != closer[k]) break;
            ++k;
          }
          if (k == 3) return _fileb->get(); // comment closed
          has_pending = true; // re-examine the character that broke the match
        }
        dashes = 0;
      }
    }

    private:
      /// Scanner states used by parse().
      enum class phase { find_id, read_id, find_text, read_text, done };

      /// Reads input while it matches `literal`; stops at (and consumes) the
      /// first mismatching character.
      /// @return true when the whole literal was matched
      bool consume_literal(const char *literal) {
        for (; *literal != 0; ++literal) {
          if (_fileb->get() != *literal) return false;
        }
        return true;
      }

      /// True for the characters that terminate a link target.
      static bool is_link_end(char c) {
        return c == ']' || c == '#' || c == '|';
      }

      /// Skips to the end of the next tag starting with "<id".
      phase run_find_id() {
        while (_fileb->full()) {
          if (_fileb->get() == '<' && consume_literal("id")) {
            while (_fileb->full() && _fileb->get() != '>') {}
            return phase::read_id;
          }
        }
        return phase::done;
      }

      /// Reads the decimal number up to the next '<' into `_cid`.
      /// Non-digit characters are ignored instead of corrupting the value.
      phase run_read_id() {
        int id = 0;
        while (_fileb->full()) {
          const char c = _fileb->get();
          if (c == '<') {
            _cid = id;
            std::cout << "\r" << "Current id: " << _cid << std::flush;
            return phase::find_text;
          }
          if (c >= '0' && c <= '9') {
            id = id * 10 + (c - '0');
          }
        }
        return phase::done;
      }

      /// Skips to the next tag starting with "<text". A '/' inside the tag is
      /// taken to mean a self-closing (empty) text element: the record is
      /// written immediately.
      phase run_find_text() {
        while (_fileb->full()) {
          if (_fileb->get() == '<' && consume_literal("text")) {
            while (_fileb->full()) {
              const char c = _fileb->get();
              if (c == '>') return phase::read_text;
              if (c == '/') {
                write_current();
                return phase::find_id;
              }
            }
            return phase::done;
          }
        }
        return phase::done;
      }

      /// Scans a text body, collecting every "[[...]]" target, until
      /// "</text>" is found.
      /// Comments are only recognised inside links: <nowiki>/<pre> make
      /// general comment handling too complex (see the original author note).
      /// Warning: a '<' that is not followed by "/text>" consumes the
      /// characters compared so far, so a link starting there may be missed.
      phase run_read_text() {
        std::vector<char> buff; // reused for every link of this text block
        while (_fileb->full()) {
          char val = _fileb->get();
          if (val == '[') {
            val = _fileb->get();
            if (val == '[') {
              val = read_link(buff);
            }
          }
          if (val == '<' && consume_literal("/text>")) {
            write_current();
            return phase::find_id;
          }
        }
        return phase::done;
      }

      /// Reads the next character of a link, skipping any escaped comments
      /// ("&lt;!-- ... --&gt;"), including several in a row.
      char next_link_char(std::vector<char> &buff) {
        char c = _fileb->get();
        while (c == '&') {
          c = check_for_xml_comment(buff);
        }
        return c;
      }

      /// Reads a link target; call right after "[[" has been consumed.
      /// The target ends at ']', '#' or '|'. A leading ':' is dropped.
      /// Empty targets and pure "#section" links are ignored.
      /// @return '<' when a raw '<' interrupted the link (the caller should
      ///         then test for the end of the text block), '\0' otherwise
      char read_link(std::vector<char> &buff) {
        buff.clear();
        char val = next_link_char(buff);
        if (val == '<') {
          buff.clear();
          return '<';
        }
        if (is_link_end(val)) {
          buff.clear();
          return '\0';
        }
        if (val != ':') {
          buff.emplace_back(val);
        }
        while (_fileb->full()) {
          val = next_link_char(buff);
          if (val == '<') { // malformed text: abandon the link
            buff.clear();
            return '<';
          }
          if (is_link_end(val)) {
            buff.emplace_back('\0'); // terminate string
            record_link(buff);
            buff.clear();
            return '\0';
          }
          buff.emplace_back(val);
        }
        return '\0';
      }

      /// Resolves the NUL-terminated name held in `buff` and stores the
      /// result: the id when resolved, a counter or `_invalid` otherwise.
      void record_link(std::vector<char> &buff) {
        const int id = _idsconv->name_to_id(buff.data());
        if (id < 0) {
          count_unresolved(buff.data());
          return;
        }
        _crefs.insert(id);
        if (_firstref == 0) {
          _firstref = id;
        }
      }

      /// Files an unresolved link name under the first matching category.
      /// No prefix below is a prefix of one in another category, so the
      /// order of the tests does not affect the result.
      void count_unresolved(const char *name) {
        static const char *const file_prefixes[] = {"file:", "image:"}; // image: is an alias of file:
        static const char *const user_prefixes[] = {"user:", "user talk:", "user_talk:"};
        static const char *const interwiki_prefixes[] = {
          "wiktionary:", "wikt:", "wikinews:", "n:", "wikibooks:", "b:",
          "wikiquote:", "q:", "wikisource:", "s:", "wikispecies:", "species:",
          "wikiversity:", "v:", "wikivoyage:", "voy:", "wikimedia:",
          "foundation:", "wmf:", "commons:", "c:"};
        static const char *const shortcut_prefixes[] = {"wp:"};
        static const char *const language_prefixes[] = {
          "simple:", "fr:", "de:", "ja:", "es:", "ru:", "pt:", "zh:", "it:",
          "ar:", "fa:", "pl:", "nl:", "uk:", "id:", "he:", "tr:"};

        if (starts_with_any(name, file_prefixes)) {
          _filenotfound++;
        } else if (starts_with_any(name, user_prefixes)) {
          _usernotfound++;
        } else if (starts_with_any(name, interwiki_prefixes)) {
          _wiktnotfound++;
        } else if (starts_with_any(name, shortcut_prefixes)) {
          _spnotfound++;
        } else if (starts_with_any(name, language_prefixes)) {
          _lgnotfound++;
        } else {
          _invalid.emplace_back(name);
        }
      }

      /// True when `name` starts (case-insensitively) with any of `prefixes`.
      template <std::size_t N>
      bool starts_with_any(const char *name, const char *const (&prefixes)[N]) {
        for (std::size_t i = 0; i < N; ++i) {
          if (start_with(name, prefixes[i])) return true;
        }
        return false;
      }

      std::ofstream _outfile;            // binary link file
      std::ofstream _outfile_fr;         // "id<TAB>first reference" text file
      readbuffer * _fileb;               // input (not owned)
      idsconv *_idsconv;                 // name -> id resolver (not owned)
      std::vector<std::string> _invalid; // unresolved names without a known prefix
      int _cid = 0;                      // id of the record being parsed
      std::set<int> _crefs;              // distinct resolved targets of the current record
      int _firstref = 0;                 // first resolved target; 0 means "none yet"
      int _filenotfound = 0;
      int _usernotfound = 0;
      int _wiktnotfound = 0;
      int _spnotfound = 0;
      int _lgnotfound = 0;
};

