/*
 * Decompiled with CFR 0.152.
 */
package org.neoref.spider;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Date;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.neoref.config.Config;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

public class Fetcher
extends Thread {
    int start;
    int end;
    int step;
    int interval;
    int checkdup;
    String journalName;
    String journalUrl;
    String journalUrlEnd;
    String journalUrlAlt;
    String journalUrlAltEnd;

    Fetcher(int start, int end, int step, String journalName, String journalUrl, String journalUrlEnd, int interval, int checkdup, String journalUrlAlt, String journalUrlAltEnd) {
        this.start = start;
        this.end = end;
        this.step = step;
        this.journalName = journalName;
        this.journalUrl = journalUrl;
        this.journalUrlEnd = journalUrlEnd;
        this.interval = interval;
        this.checkdup = checkdup;
        this.journalUrlAlt = journalUrlAlt;
        this.journalUrlAltEnd = journalUrlAltEnd;
    }

    public void run() {
        try {
            String articlePath = new String();
            PDDocument pdfDoc = new PDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            Class.forName("com.mysql.jdbc.Driver").newInstance();
            Connection dbConn = DriverManager.getConnection(Config.SEARCHDB_STR);
            Statement st = dbConn.createStatement();
            URL esearchURL = new URL("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + this.journalName + "[ta]&usehistory=y");
            URLConnection conn = esearchURL.openConnection();
            InputStream is = conn.getInputStream();
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document xmlDoc = db.parse(is);
            int count = Integer.parseInt(xmlDoc.getElementsByTagName("Count").item(0).getFirstChild().getNodeValue());
            String webEnv = xmlDoc.getElementsByTagName("WebEnv").item(0).getFirstChild().getNodeValue();
            int query_key = Integer.parseInt(xmlDoc.getElementsByTagName("QueryKey").item(0).getFirstChild().getNodeValue());
            if (this.end < 0) {
                this.end = count - 1;
            } else if (this.end > count) {
                this.end = count - 1;
            }
            for (int retstart = this.start; retstart <= this.end; retstart += this.step) {
                if (retstart + this.step > this.end && this.step > 1) {
                    this.step = 1;
                }
                Fetcher.log("Fetch a list of articles...");
                URL efetchURL = new URL("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=" + webEnv + "&query_key=" + query_key + "&retstart=" + retstart + "&retmax=" + this.step + "&retmode=xml");
                conn = efetchURL.openConnection();
                is = conn.getInputStream();
                dbf = DocumentBuilderFactory.newInstance();
                db = dbf.newDocumentBuilder();
                xmlDoc = db.parse(is);
                NodeList articleListNodeList = xmlDoc.getElementsByTagName("ArticleIdList");
                NodeList medlineCitationNodeList = xmlDoc.getElementsByTagName("MedlineCitation");
                NodeList volNodeList = xmlDoc.getElementsByTagName("Volume");
                NodeList issueNodeList = xmlDoc.getElementsByTagName("Issue");
                NodeList pgnNodeList = xmlDoc.getElementsByTagName("MedlinePgn");
                NodeList statusNodeList = xmlDoc.getElementsByTagName("PublicationStatus");
                NodeList titleNodeList = xmlDoc.getElementsByTagName("ArticleTitle");
                NodeList abstractNodeList = xmlDoc.getElementsByTagName("AbstractText");
                int numArticle = articleListNodeList.getLength();
                Fetcher.log("Process " + numArticle + " article(s)...");
                for (int i = 0; i < numArticle; ++i) {
                    Node titleNode;
                    String pmid;
                    Node articleListNode = articleListNodeList.item(i);
                    NodeList articleNodeList = articleListNode.getChildNodes();
                    Fetcher.log("Check article status");
                    String pubStatus = "";
                    Node statusNode = statusNodeList.item(i);
                    if (statusNode.hasChildNodes()) {
                        pubStatus = statusNode.getFirstChild().getNodeValue();
                    }
                    int numArticleID = articleNodeList.getLength();
                    boolean doiArticle = false;
                    boolean pubmedArticle = false;
                    boolean piiArticle = false;
                    String urlStr = "";
                    for (int j = 1; j < numArticleID; j += 2) {
                        int pos;
                        Node articleNode = articleNodeList.item(j);
                        NamedNodeMap nodeMap = articleNode.getAttributes();
                        Node attr = nodeMap.getNamedItem("IdType");
                        if (pubStatus.equals("aheadofprint")) {
                            if (!attr.getNodeValue().equals("doi") || this.journalName.equals("american+journal+of+human+genetics") || pubmedArticle || piiArticle) continue;
                            Fetcher.log("Use doi to build url");
                            doiArticle = true;
                            articlePath = articleNode.getFirstChild().getNodeValue();
                            pos = articlePath.indexOf(47);
                            articlePath = articlePath.substring(pos);
                            if (this.journalName.equals("journal+of+experimental+botany")) {
                                pos = articlePath.indexOf("eri");
                                if (pos == -1) {
                                    pos = articlePath.indexOf("erh");
                                }
                                if (pos == -1) {
                                    Fetcher.log("Invalid article path");
                                    continue;
                                }
                                articlePath = "/" + articlePath.substring(pos);
                            }
                            if (this.journalName.equals("pnas")) {
                                pos = articlePath.indexOf(".");
                                if (pos == -1) {
                                    Fetcher.log("Invalid article path");
                                    continue;
                                }
                                articlePath = "/" + articlePath.substring(pos + 1);
                            }
                            urlStr = this.journalUrl + articlePath + this.journalUrlEnd;
                            continue;
                        }
                        if (attr.getNodeValue().equals("pubmed") && !this.journalName.equals("american+journal+of+human+genetics") && !doiArticle && !piiArticle) {
                            Fetcher.log("Use pubmed to build url");
                            pubmedArticle = true;
                            Node volNode = volNodeList.item(i);
                            if (volNode == null || !volNode.hasChildNodes()) continue;
                            String volume = volNode.getFirstChild().getNodeValue();
                            Node issueNode = issueNodeList.item(i);
                            if (issueNode == null || !issueNode.hasChildNodes()) continue;
                            String issue = issueNode.getFirstChild().getNodeValue();
                            Node pgnNode = pgnNodeList.item(i);
                            if (pgnNode == null || !pgnNode.hasChildNodes()) continue;
                            String medlinepgn = pgnNode.getFirstChild().getNodeValue();
                            pos = medlinepgn.indexOf("-");
                            if (pos != -1) {
                                medlinepgn = medlinepgn.substring(0, pos);
                            }
                            urlStr = this.journalUrlAlt + "/" + volume + "/" + issue + "/" + medlinepgn + this.journalUrlAltEnd;
                            continue;
                        }
                        if (!attr.getNodeValue().equals("pii") || doiArticle || pubmedArticle) continue;
                        Fetcher.log("Use pii to build url");
                        if (this.journalName.equals("pnas")) {
                            Fetcher.log("Skip pii for pnas");
                            continue;
                        }
                        piiArticle = true;
                        articlePath = articleNode.getFirstChild().getNodeValue();
                        urlStr = this.journalUrl + articlePath + this.journalUrlEnd;
                    }
                    if (!doiArticle && !pubmedArticle && !piiArticle) continue;
                    Node medlineCitationNode = medlineCitationNodeList.item(i);
                    NodeList medlineCitationChildNodeList = medlineCitationNode.getChildNodes();
                    Node pmidNode = medlineCitationChildNodeList.item(1);
                    if (pmidNode.hasChildNodes()) {
                        pmid = pmidNode.getFirstChild().getNodeValue();
                        if (this.checkdup == 1) {
                            Fetcher.log("Checking duplicate for (PMID: " + pmid + ")");
                            ResultSet rs = st.executeQuery("select * from metadata where pmid='" + pmid + "'");
                            if (rs.next()) {
                                Fetcher.log("Found duplicate, skip article (PMID: " + pmid + ")");
                                ++this.start;
                                continue;
                            }
                        }
                    } else {
                        pmid = "";
                    }
                    if ((titleNode = titleNodeList.item(i)) == null || !titleNode.hasChildNodes()) continue;
                    String titleStr = titleNode.getFirstChild().getNodeValue();
                    Node abstractNode = abstractNodeList.item(i);
                    if (abstractNode == null || !abstractNode.hasChildNodes()) continue;
                    String abstractStr = abstractNode.getFirstChild().getNodeValue();
                    try {
                        Fetcher.log("Download an article from: " + urlStr);
                        URL articleURL = new URL(urlStr);
                        conn = articleURL.openConnection();
                        is = conn.getInputStream();
                    }
                    catch (Exception e) {
                        Fetcher.log("Error: cannot access (PMID: " + pmid + "), message is " + e.getMessage());
                        ++this.start;
                        Fetcher.sleep(this.interval);
                        continue;
                    }
                    try {
                        Fetcher.log("Parsing PDF");
                        PDFParser parser = new PDFParser(is);
                        parser.parse();
                        pdfDoc = parser.getPDDocument();
                        String txtStr = stripper.getText(pdfDoc).replace('\n', ' ');
                        txtStr = txtStr.replace('\'', ' ');
                        titleStr = titleStr.replace('\'', ' ');
                        abstractStr = abstractStr.replace('\'', ' ');
                        Fetcher.log("Adding article (PMID: " + pmid + ") to the database...");
                        st.executeUpdate("insert into metadata (title, subject, creator, source, relation, contributor, format, identifier, language, coverage, rights, publisher, date, description, type, fulltxt, pmid) values ('" + titleStr + "', ' ', ' ', ' ', ' ', ' ', ' ', '" + urlStr + "', ' ', ' ', ' ', ' ', ' ', '" + abstractStr + "', ' ', '" + txtStr + "', '" + pmid + "')");
                        pdfDoc.close();
                        ++this.start;
                        Fetcher.sleep(this.interval);
                        Fetcher.log("Article number " + this.start + " (PMID: " + pmid + ") saved!");
                        continue;
                    }
                    catch (Exception e) {
                        Fetcher.log("PDF Error Message: " + e.getMessage());
                        if (pdfDoc != null) {
                            pdfDoc.close();
                        }
                        ++this.start;
                    }
                }
            }
        }
        catch (Exception e) {
            Fetcher.log("General Error Message: " + e.getMessage());
        }
    }

    public static void log(String entry) {
        System.out.println(new Date() + ":" + entry);
    }

    public static void main(String[] args) throws Exception {
        String cfgStr = "";
        String journalName = "";
        String journalUrl = "";
        String journalUrlEnd = "";
        String journalUrlAlt = "";
        String journalUrlAltEnd = "";
        int start = 0;
        int end = 0;
        int step = 0;
        int interval = 0;
        int checkdup = 0;
        if (args.length < 1) {
            System.out.println("Usage: fetch config-file");
            System.exit(-1);
        }
        BufferedReader cfgIn = new BufferedReader(new FileReader(args[0]));
        while ((cfgStr = cfgIn.readLine()) != null) {
            if (cfgStr.startsWith("#")) continue;
            StringTokenizer st = new StringTokenizer(cfgStr, " ");
            if (st.hasMoreTokens()) {
                start = Integer.parseInt(st.nextToken()) - 1;
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                end = Integer.parseInt(st.nextToken()) - 1;
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                step = Integer.parseInt(st.nextToken());
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                journalName = st.nextToken();
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                journalUrl = st.nextToken();
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                journalUrlEnd = st.nextToken();
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                interval = Integer.parseInt(st.nextToken()) * 1000;
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                checkdup = Integer.parseInt(st.nextToken());
            } else {
                System.out.println("bad config file");
                System.exit(-1);
            }
            if (st.hasMoreTokens()) {
                journalUrlAlt = st.nextToken();
            }
            if (st.hasMoreTokens()) {
                journalUrlAltEnd = st.nextToken();
            }
            Fetcher.log("Fetching from " + journalName);
            Fetcher ft = new Fetcher(start, end, step, journalName, journalUrl, journalUrlEnd, interval, checkdup, journalUrlAlt, journalUrlAltEnd);
            ft.start();
            ft.join();
        }
    }
}

