/*
 * Decompiled with CFR 0.152.
 */
package org.neoref.spider;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Date;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.oro.text.perl.Perl5Util;
import org.neoref.config.Config;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Copies gene records from the NCBI E-utilities "gene" database into the local
 * {@code gene} table.
 *
 * <p>A run issues one {@code esearch} request for the configured term (keeping the
 * result set on the server via {@code usehistory=y}), then pages through that result
 * set with {@code efetch}, inserting one row per {@code Gene-track_geneid} element
 * found in each page.
 *
 * <p>{@link #main(String[])} reads a configuration file and executes one builder per
 * configuration line, one after another.
 */
public class GeneDBBuilder
extends Thread {
    /** Base address of the E-utilities endpoints (esearch / efetch). */
    private static final String EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /** Looks up an existing row by gene id; the id is bound as a parameter. */
    private static final String FIND_GENE_SQL = "select * from gene where geneid=?";

    /** Inserts a gene row; every value is bound as a parameter. */
    private static final String INSERT_GENE_SQL =
            "insert into gene (locus, name, symbol, type, alias, geneid) values (?,?,?,?,?,?)";

    /**
     * Perl5 substitution applied to names and symbols before they are stored.
     *
     * <p>NOTE(review): inside the character class, the sequence quote-dash-comma is a
     * RANGE running from the apostrophe to the comma (apostrophe, both parentheses,
     * asterisk, plus sign, comma). A literal hyphen is not part of it. Kept unchanged
     * so stored values stay the same; confirm whether a hyphen was intended.
     */
    private static final String SANITIZE_EXPR = "s/['-,]/ /g";

    /** 0-based index of the next gene; incremented for every gene handled. */
    int start;
    /** 0-based index of the last gene to fetch; negative means "up to the last hit". */
    int end;
    /** Number of genes requested per efetch call; drops to 1 near the end of the range. */
    int step;
    /** Number of genes handled between two pauses. */
    int batch;
    /** Pause length in milliseconds. */
    int interval;
    /** When 1, genes whose id is already in the table are skipped. */
    int checkdup;
    /** Search term sent to esearch. */
    String kingdom;

    /**
     * Creates a builder for one search term and index range.
     *
     * @param start    0-based index of the first gene to fetch
     * @param end      0-based index of the last gene to fetch; negative or too large
     *                 values are replaced by the index of the last search hit
     * @param step     number of genes requested per efetch call; must be at least 1
     * @param kingdom  search term passed to esearch
     * @param batch    number of genes handled between two pauses
     * @param interval pause length in milliseconds
     * @param checkdup 1 to skip genes already present in the table
     */
    GeneDBBuilder(int start, int end, int step, String kingdom, int batch, int interval, int checkdup) {
        this.start = start;
        this.end = end;
        this.step = step;
        this.kingdom = kingdom;
        this.batch = batch;
        this.interval = interval;
        this.checkdup = checkdup;
    }

    /**
     * Opens the database connection, runs the harvest and releases every resource.
     *
     * <p>Failures are reported through {@link #log(String)} and end the run; nothing is
     * rethrown to the thread's caller.
     */
    @Override
    public void run() {
        if (this.step < 1) {
            // A non-positive step would never advance the paging loop.
            GeneDBBuilder.log("General Error Message: step must be at least 1 but was " + this.step);
            return;
        }
        try {
            Class.forName("com.mysql.jdbc.Driver").newInstance();
            try (Connection dbConn = DriverManager.getConnection(Config.SEARCHDB_STR);
                 PreparedStatement findGene = dbConn.prepareStatement(FIND_GENE_SQL);
                 PreparedStatement insertGene = dbConn.prepareStatement(INSERT_GENE_SQL)) {
                this.harvest(findGene, insertGene);
            }
        }
        catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            GeneDBBuilder.log("General Error Message: " + e);
        }
        catch (Exception e) {
            // Log the exception itself: getMessage() alone is frequently null and hides the type.
            GeneDBBuilder.log("General Error Message: " + e);
        }
    }

    /**
     * Runs the esearch query and pages through the hits with efetch, storing each gene.
     *
     * @param findGene   prepared duplicate lookup ({@link #FIND_GENE_SQL})
     * @param insertGene prepared insert ({@link #INSERT_GENE_SQL})
     */
    private void harvest(PreparedStatement findGene, PreparedStatement insertGene)
            throws IOException, ParserConfigurationException, SAXException, SQLException, InterruptedException {
        Perl5Util util = new Perl5Util();

        Document searchDoc = GeneDBBuilder.fetchXml(
                EUTILS_BASE + "esearch.fcgi?db=gene&term=" + this.kingdom + "&usehistory=y");
        int count = Integer.parseInt(GeneDBBuilder.requiredText(searchDoc, "Count"));
        String webEnv = GeneDBBuilder.requiredText(searchDoc, "WebEnv");
        int queryKey = Integer.parseInt(GeneDBBuilder.requiredText(searchDoc, "QueryKey"));

        // Indices are 0-based, so the last valid one is count - 1 (end == count is already too far).
        if (this.end < 0 || this.end >= count) {
            this.end = count - 1;
        }

        int sinceSleep = 0;
        for (int retstart = this.start; retstart <= this.end; retstart += this.step) {
            // Once a full page would run past the end, fetch the remainder one gene at a time.
            if (retstart + this.step > this.end && this.step > 1) {
                this.step = 1;
            }
            GeneDBBuilder.log("Fetch a list of genes...");
            Document pageDoc = GeneDBBuilder.fetchXml(
                    EUTILS_BASE + "efetch.fcgi?db=gene&WebEnv=" + webEnv + "&query_key=" + queryKey
                            + "&retstart=" + retstart + "&retmax=" + this.step + "&retmode=xml");

            // NOTE(review): the three lists are paired purely by position; if a gene lacks a
            // locus tag or protein name the later entries shift. Confirm against the efetch schema.
            NodeList locusNodes = pageDoc.getElementsByTagName("Gene-ref_locus-tag");
            NodeList nameNodes = pageDoc.getElementsByTagName("Prot-ref_name_E");
            NodeList geneIdNodes = pageDoc.getElementsByTagName("Gene-track_geneid");

            int numGene = geneIdNodes.getLength();
            GeneDBBuilder.log("Process " + numGene + " gene(s)...");
            for (int i = 0; i < numGene; ++i) {
                String locus = GeneDBBuilder.textAt(locusNodes, i);
                String[] nameAndSymbol = GeneDBBuilder.splitNameAndSymbol(GeneDBBuilder.textAt(nameNodes, i));
                String name = util.substitute(SANITIZE_EXPR, nameAndSymbol[0]);
                String symbol = util.substitute(SANITIZE_EXPR, nameAndSymbol[1]);
                String geneID = GeneDBBuilder.textAt(geneIdNodes, i);

                if (this.checkdup == 1) {
                    GeneDBBuilder.log("Checking duplicate for (GENEID: " + geneID + ")");
                    if (GeneDBBuilder.geneExists(findGene, geneID)) {
                        GeneDBBuilder.log("Found duplicate, skip gene " + this.start + " (GENEID: " + geneID + ")");
                        ++this.start;
                        sinceSleep = this.throttle(sinceSleep);
                        continue;
                    }
                }

                GeneDBBuilder.log("Adding gene (GENEID: " + geneID + ") to the database...");
                insertGene.setString(1, locus);
                insertGene.setString(2, name);
                insertGene.setString(3, symbol);
                insertGene.setString(4, ""); // type: never populated by this builder
                insertGene.setString(5, ""); // alias: never populated by this builder
                insertGene.setString(6, geneID);
                insertGene.executeUpdate();
                ++this.start;
                sinceSleep = this.throttle(sinceSleep);
                GeneDBBuilder.log("Gene " + this.start + " (GENEID: " + geneID + ") saved!");
            }
        }
    }

    /**
     * Pauses for {@link #interval} milliseconds once {@link #batch} genes have been
     * handled since the previous pause.
     *
     * @param sinceSleep number of genes handled since the previous pause
     * @return the updated counter: 0 after a pause, otherwise {@code sinceSleep + 1}
     */
    private int throttle(int sinceSleep) throws InterruptedException {
        if (sinceSleep >= this.batch) {
            System.out.println("sleeping...");
            Thread.sleep(this.interval);
            return 0;
        }
        return sinceSleep + 1;
    }

    /** Downloads {@code address}, parses the response as XML and closes the stream. */
    private static Document fetchXml(String address)
            throws IOException, ParserConfigurationException, SAXException {
        URLConnection conn = new URL(address).openConnection();
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        try (InputStream in = conn.getInputStream()) {
            return builder.parse(in);
        }
    }

    /**
     * Returns the value of the first child of the {@code index}-th node, or an empty
     * string when the node is missing, has no children, or that child has no value.
     */
    private static String textAt(NodeList nodes, int index) {
        if (nodes == null) {
            return "";
        }
        Node node = nodes.item(index);
        if (node == null || !node.hasChildNodes()) {
            return "";
        }
        String value = node.getFirstChild().getNodeValue();
        return value == null ? "" : value;
    }

    /**
     * Returns the text of the first {@code tag} element of the esearch response.
     *
     * @throws IllegalStateException when the element is absent or empty
     */
    private static String requiredText(Document doc, String tag) {
        String value = GeneDBBuilder.textAt(doc.getElementsByTagName(tag), 0);
        if (value.length() == 0) {
            throw new IllegalStateException("esearch response has no <" + tag + "> value");
        }
        return value;
    }

    /**
     * Splits a raw protein name of the form {@code "name (symbol) ..."}.
     *
     * @return a two-element array {@code {name, symbol}}; the symbol is empty when the
     *         raw text does not contain a "(" that separates two parts
     */
    private static String[] splitNameAndSymbol(String raw) {
        String name = raw;
        String symbol = "";
        if (raw.indexOf('(') != -1) {
            StringTokenizer parts = new StringTokenizer(raw, "(");
            if (parts.countTokens() > 1) {
                name = parts.nextToken();
                symbol = parts.nextToken();
                // Cut at the closing parenthesis when there is one; an unbalanced "(" keeps the rest.
                int close = symbol.indexOf(')');
                if (close >= 0) {
                    symbol = symbol.substring(0, close);
                }
            }
        }
        return new String[] {name, symbol};
    }

    /** Returns whether the {@code gene} table already holds a row with this gene id. */
    private static boolean geneExists(PreparedStatement findGene, String geneID) throws SQLException {
        findGene.setString(1, geneID);
        try (ResultSet rows = findGene.executeQuery()) {
            return rows.next();
        }
    }

    /** Prints {@code entry} to standard output, prefixed with the current date and time. */
    public static void log(String entry) {
        System.out.println(new Date() + ":" + entry);
    }

    /**
     * Reads the configuration file named by {@code args[0]} and runs one builder per line,
     * waiting for each to finish before starting the next.
     *
     * <p>Each line holds seven space-separated fields:
     * {@code start end step kingdom batch interval checkdup}. {@code start} and
     * {@code end} are 1-based in the file, {@code interval} is given in seconds. Lines
     * starting with {@code #} and blank lines are ignored. A line with too few fields
     * prints "bad config file" and exits with status -1.
     *
     * @param args command-line arguments; {@code args[0]} is the configuration file path
     * @throws Exception when the file cannot be read or a numeric field is malformed
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println("Usage: geneDBBuild config-file");
            System.exit(-1);
        }
        try (BufferedReader cfgIn = new BufferedReader(new FileReader(args[0]))) {
            String cfgStr;
            while ((cfgStr = cfgIn.readLine()) != null) {
                if (cfgStr.startsWith("#") || cfgStr.trim().length() == 0) {
                    continue;
                }
                StringTokenizer fields = new StringTokenizer(cfgStr, " ");
                int start = Integer.parseInt(GeneDBBuilder.nextField(fields)) - 1;
                int end = Integer.parseInt(GeneDBBuilder.nextField(fields)) - 1;
                int step = Integer.parseInt(GeneDBBuilder.nextField(fields));
                String kingdom = GeneDBBuilder.nextField(fields);
                int batch = Integer.parseInt(GeneDBBuilder.nextField(fields));
                int interval = Integer.parseInt(GeneDBBuilder.nextField(fields)) * 1000;
                int checkdup = Integer.parseInt(GeneDBBuilder.nextField(fields));

                GeneDBBuilder.log("Build gene database for " + kingdom);
                GeneDBBuilder gdbd = new GeneDBBuilder(start, end, step, kingdom, batch, interval, checkdup);
                gdbd.start();
                gdbd.join();
            }
        }
    }

    /** Returns the next configuration field, or reports a bad file and exits when none is left. */
    private static String nextField(StringTokenizer fields) {
        if (!fields.hasMoreTokens()) {
            System.out.println("bad config file");
            System.exit(-1);
        }
        return fields.nextToken();
    }
}

