<?php
/*******************************************************************************
 *
 * LEIDEN OPEN VARIATION DATABASE (LOVD)
 *
 * Created     : 2011-04-05
 * Modified    : 2012-03-14
 * For LOVD    : 2.0-34
 *
 * Access      : Public
 * Purpose     : Proof-of-concept VarioML exporter for LOVD 2.0.
 *               Supported:
 *  2.0-34       /api/rest.php/export
 *  2.0-34       /api/rest.php/export/{{ GENE }}
 *  2.0-34       /api/rest.php/export/{{ INDIVIDUAL_ID }}
 *
 * Copyright   : 2004-2012 Leiden University Medical Center; http://www.LUMC.nl/
 * Programmer  : Ing. Ivo F.A.C. Fokkema <I.F.A.C.Fokkema@LUMC.nl>
 * Last edited : Ing. Ivo F.A.C. Fokkema <I.F.A.C.Fokkema@LUMC.nl>
 *
 *
 * This file is part of LOVD.
 *
 * LOVD is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * LOVD is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LOVD; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *************/

define('ROOT_PATH', '../');
require ROOT_PATH . 'inc-init.php';

// This export is restricted to users of at least manager level.
if (!($_AUTH && $_AUTH['level'] >= LEVEL_MANAGER)) {
    // Not logged in, or not enough rights. The clean page top is loaded
    // before lovd_requireAuth() is called; for sufficiently authorized users
    // neither is needed, which is the reason for this explicit check.
    require ROOT_PATH . 'inc-top-clean.php';
    lovd_requireAuth(LEVEL_MANAGER);
}
require ROOT_PATH . 'class/feeds.php';

// Start out as text/plain, so that error messages printed before the actual
// export are served with a fitting content-type.
header('Content-type: text/plain; charset=ISO-8859-1');

// GET is the only request method implemented at the moment.
if ($_SERVER['REQUEST_METHOD'] != 'GET') {
    header('HTTP/1.0 501 Not Implemented');
    exit;
}

// Define functions.
function lovd_parsePubMed ($s)
{
    // Extracts the first LOVD 2.0 style PubMed reference, written as
    // {PMID<digits>:<author text>}, from the given string.
    // Returns an array with the keys 'id', 'url' and 'author', or false when
    // $s is empty, not a string, or contains no such reference.
    // NOTE: only the first reference in the string is handled.
    // FIXME; allow for multiple references!
    // FIXME; allow LOVD 3.0 style references!
    if (!$s || !is_string($s)) {
        return false;
    }

    if (!preg_match('/\{PMID([0-9]+):([^\}]+)\}/', $s, $aMatches)) {
        return false;
    }

    // $aMatches[1] holds the PubMed ID, $aMatches[2] the author text.
    return array(
        'id' => $aMatches[1],
        'url' => 'http://www.ncbi.nlm.nih.gov/pubmed/' . $aMatches[1],
        'author' => $aMatches[2],
    );
}





// Only one gene/individual selected?
// The first non-empty PATH_INFO segment decides what is exported: a segment
// consisting of digits only selects an individual, anything else is taken to
// be a gene symbol. Any further segments are ignored.
$sSymbol = $nIndividual = '';
if (!empty($_SERVER['PATH_INFO'])) {
    $aPATH_INFO = explode('/', $_SERVER['PATH_INFO']);
    foreach ($aPATH_INFO as $sPath) {
        if (!$sPath) {
            continue;
        }
        if (ctype_digit($sPath)) {
            $nIndividual = $sPath;
        } else {
            $sSymbol = $sPath;
        }
        break;
    }
}
if ($sSymbol && !in_array($sSymbol, lovd_getGeneList())) {
    header('HTTP/1.0 404 Not Found');
    die('Gene ' . $sSymbol . ' does not exist in this LOVD.');
}
if ($nIndividual) {
    // Retrieving patient & variant data will be later, now I just need the genes.
    // $nIndividual consists of digits only (see ctype_digit() above), so it can be put in the query as is.
    // FIXME; group_concat would be nice here now...
    $aSymbols = array();
    $q = mysql_query('SELECT p2v.symbol FROM ' . TABLE_PATIENTS . ' AS p LEFT JOIN ' . TABLE_PAT2VAR . ' AS p2v ON (p.patientid = p2v.patientid) WHERE p.patientid = "' . $nIndividual . '" AND p2v.status >= "' . STATUS_MARKED . '"');
    while ($r = mysql_fetch_row($q)) {
        $aSymbols[] = $r[0];
    }
    $sSymbol = implode('","', $aSymbols); // Format to be used in queries.
    if (!$sSymbol) {
        // No genes found for this individual; report this the same way as an unknown gene, with a 404 status.
        header('HTTP/1.0 404 Not Found');
        die('Patient ' . $nIndividual . ' does not exist in this LOVD.');
    }
}

if (isset($_GET['download'])) {
    // $sSymbol can hold several symbols glued together with '","' (the query
    // format built above). Those double quotes would end the quoted filename
    // value too early, so the symbols are joined by underscores here.
    header('Content-Disposition: attachment; filename="LOVD_export_varioml' . ($sSymbol? '_' . str_replace('","', '_', $sSymbol) : '') . '.txt"');
    // Fix IE download issue when using SSL.
    header('Pragma: public');
}



// Make the output format available as the FORMAT constant. The requested
// format ($_GET['format']) is only honored when it is one of the supported
// formats; in all other cases the first supported format is used.
$aFormats = array('text/xml', 'text/plain');
define('FORMAT', (!empty($_GET['format']) && in_array($_GET['format'], $aFormats)? $_GET['format'] : $aFormats[0]));
$_FEED = new Feed();
// From here on, FORMAT is set, and $sSymbol is filled in if a gene or individual was selected.


/* FIXMEs
- LSDB id?
- Maybe instead of doing it this way, build up a big array that gets translated to XML in the end? Uses more memory but is more flexible.
- The spec gives this example for the <gene> element:
          <db_xref accession="BRACA2" source="HGNC" uri="http://www.genenames.org/data/hgnc_data.php?hgnc_id=1101"/>
  I believe this should be (thus I implemented:):
          <db_xref accession="1101" source="HGNC" uri="http://www.genenames.org/data/hgnc_data.php?hgnc_id=1101"/>
- Can't do <genetic_origin> right now, because the required attribute "term" may not be available.
- What are allowed sources for the <ref_seq> element? I used "genbank" (from example) and "uniprot". Is that OK?
- What do I do with Variant/DBID?

FIXME's in the code.
*/





// First, the source information:
// Prints the XML declaration, the opening <lsdb> tag and the <source> element,
// with one <contact> element per curator listing the genes they curate.
header('Content-type: ' . FORMAT . '; charset=ISO-8859-1'); // Overwrites the previous content-type.
if (!$_CONF['location_url']) {
    // No URL configured; derive it from the current request. The value is stored in $_CONF because it is used for the individual and variant URIs as well.
    $_CONF['location_url'] = PROTOCOL . $_SERVER['HTTP_HOST'] . lovd_cleanDirName(dirname($_SERVER['SCRIPT_NAME']) . '/' . ROOT_PATH);
}
// The charset is passed explicitly to htmlspecialchars() in this section: the
// document is declared as ISO-8859-1, while the function's default charset is
// UTF-8 as of PHP 5.4, in which case it returns an empty string for input that
// is not valid UTF-8. NOTE(review): assumes the stored data is ISO-8859-1, like the declared encoding.
print('<?xml version="1.0" encoding="ISO-8859-1"?>
<lsdb id="LOVD:' . $_STAT['signature'] . '" schema_version="2.0" xmlns="http://gen2phen.org/varioml/2.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://gen2phen.org/lsdb/2.0">

    <source uri="' . $_CONF['location_url'] . '"
        date="' . $_FEED->formatDate(time()) . '">
        <name>' . htmlspecialchars($_CONF['system_title'], ENT_COMPAT, 'ISO-8859-1') . '</name>');

// Per curator, a <contact> element.
// FIXME; use group_concat once this gets to LOVD 3.0!!!
$sSQL = 'SELECT u.userid, u.name, u.email, c.symbol FROM ' . TABLE_USERS . ' AS u INNER JOIN ' . TABLE_CURATES . ' AS c USING (userid)' . (!$sSymbol? '' : ' WHERE c.symbol IN ("' . $sSymbol . '")') . ' ORDER BY u.name, c.symbol';
$q = @mysql_query($sSQL);
$n = @mysql_num_rows($q);
$nPrevID = 0;
while ($z = @mysql_fetch_assoc($q)) {
    // Names, e-mail addresses and symbols come straight from the database and
    // may contain characters like & and <, so escape them before putting them in the XML.
    $sName   = htmlspecialchars($z['name'], ENT_COMPAT, 'ISO-8859-1');
    $sEmail  = htmlspecialchars($z['email'], ENT_COMPAT, 'ISO-8859-1');
    $sGene   = htmlspecialchars($z['symbol'], ENT_COMPAT, 'ISO-8859-1');
    if ($nPrevID != $z['userid']) {
        // New user. We're in this crappy method because we can't use GROUP_CONCAT on older servers :(
        if ($nPrevID) {
            // "Close" previous user entry.
            print('</comment>
        </contact>');
        }
        print('
        <contact>
            <name>' . $sName . '</name>
            <email>' . $sEmail . '</email>
            <comment>Curator of: ' . $sGene);
        $nPrevID = $z['userid'];
    } else {
        // Same curator as the previous row; just add the gene to the list.
        print(', ' . $sGene);
    }
}
if ($n) {
    // Close the last curator.
    print('</comment>
        </contact>');
}
// Close source.
print('
    </source>' . "\n");





// Retrieve gene information and put in array.
// $aGenes is indexed by gene symbol. Each entry holds the (escaped) database
// fields, plus 'XML' (the ready-made <gene> element that is printed for each
// variant in that gene) and 'chromosome' (taken from the chromosomal location).
$aGenes = array();
$q = mysql_query('SELECT symbol, gene, chrom_location, refseq_genomic, refseq_mrna, refseq_build, id_hgnc, id_entrez, id_omim_gene, id_uniprot FROM ' . TABLE_DBS . (!$sSymbol? '' : ' WHERE symbol IN ("' . $sSymbol . '")'));

// Columns that result in a <db_xref> element when filled in, in output order.
// Per column: the value for the "source" attribute and the URI prefix.
$aXrefSources =
         array(
                'id_hgnc' => array('HGNC', 'http://www.genenames.org/data/hgnc_data.php?hgnc_id='),
                'id_entrez' => array('NCBI', 'http://www.ncbi.nlm.nih.gov/gene/'),
                'id_omim_gene' => array('OMIM', 'http://www.omim.org/entry/'),
              );

while ($z = mysql_fetch_assoc($q)) {
    $z = array_map('htmlspecialchars', $z);

    // Build the <gene> element: opening tag, the available cross references, the gene's name.
    $sGeneXML = '            <gene source="HGNC" accession="' . $z['symbol'] . '">' . "\n";
    foreach ($aXrefSources as $sXrefColumn => $aXrefSource) {
        if ($z[$sXrefColumn]) {
            $sGeneXML .= '                <db_xref accession="' . $z[$sXrefColumn] . '" source="' . $aXrefSource[0] . '" uri="' . $aXrefSource[1] . $z[$sXrefColumn] . '"/>' . "\n";
        }
    }
    $sGeneXML .= '                <comment term="name">
                    <text>' . $z['gene'] . '</text>
                </comment>
            </gene>' . "\n";
    $z['XML'] = $sGeneXML;

    // Keep only the chromosome part of the location. The pattern is not
    // strict, and a location that does not match it is kept unchanged.
    $z['chromosome'] = preg_replace('/^([0-9]{1,2}|[MXY])([pqtercen0-9.-]+)?$/', "$1", $z['chrom_location']);
    $aGenes[$z['symbol']] = $z;
}

// Pathogenicity values; translates each digit of a variant's pathogenicity code to its term.
$aPathogenicities =
         array(
                5 => 'Not Known',
                9 => 'Pathogenic',
                7 => 'Probably Pathogenic',
                3 => 'Probably Not Pathogenic',
                1 => 'Non-pathogenic',
              );






// Per individual, show data + its variants.
// Simplest approach: get all the patient and pat2var data out of the DB in one
// query (requires significant memory usage on big LOVDs) ordered by patient,
// then walk through the data and "group" by patientid. Basically like the
// curator list above. We can't get to the Variant data in this query, because
// we don't know which table to join to :( Stupid LOVD 2.0! LOVD 3.0 is just so
// much better...
// The query returns one row per individual per gene. On the first row of an
// individual, its opening tag and individual-level data are printed, and its
// trailer (reference, remarks, dates) is buffered. The variants of each row's
// gene are printed per row. The trailer is only written right before the
// individual is closed, so that for individuals with variants in more than one
// gene, it does not end up in between the <variant> elements.
// FIXME; use group_concat once this gets to LOVD 3.0!!!
// Patients are sorted by ID... nothing to do about it, since we *must* split the query *without* the options of subqueries or group_concat.
$sSQL = 'SELECT p.*, p2v.symbol FROM ' . TABLE_PATIENTS . ' AS p INNER JOIN ' . TABLE_PAT2VAR . ' AS p2v USING (patientid) WHERE ' . ($nIndividual? 'p.patientid = "' . $nIndividual . '" AND ' : (!$sSymbol? '' : 'p2v.symbol IN ("' . $sSymbol . '") AND ')) . 'p2v.status >= ' . STATUS_MARKED . ' ORDER BY p.patientid, p2v.symbol';
$qPat = @mysql_query($sSQL);
$n = @mysql_num_rows($qPat);
$nPrevID = 0;
$sIndividualFooter = ''; // Buffered trailer of the individual that is currently open.
while ($zPat = @mysql_fetch_assoc($qPat)) {
    $zPat = array_map('htmlspecialchars', $zPat);
    if ($nPrevID != $zPat['patientid']) {
        // New patient. We're in this crappy method because we can't use GROUP_CONCAT on older servers :(
        if ($nPrevID) {
            // Finish and "close" the previous individual.
            print($sIndividualFooter . '    </individual>' . "\n");
        }
        // FIXME; add Patient/PatientID if allowed to publish.
        print('
    <individual id="' . $zPat['patientid'] . '" uri="' . $_CONF['location_url'] . 'variants.php?action=view&amp;view=' . $zPat['patientid'] . '">
        <original_id accession="' . $zPat['patientid'] . '"><comment><text>actual ID used by submitter is non-public</text></comment></original_id>' . "\n");

        // Gender. Code 1 for male, 2 for female, 9 for any other value (which
        // is then given as a description), 0 when no gender is filled in.
        if (!empty($zPat['Patient/Gender'])) {
            $nGender = (in_array($zPat['Patient/Gender'], array('M', 'Male'))? 1 : (in_array($zPat['Patient/Gender'], array('F', 'Female'))? 2 : 9));
        } else {
            $nGender = 0;
        }
        print('        <gender code="' . $nGender . '"' .
            ($nGender != 9? '/>' : '>
            <description term="' . $zPat['Patient/Gender'] . '"/>
        </gender>') . "\n");

        // Phenotype (Patient/Phenotype/Disease). // FIXME; try to get a better description through the column's settings and/or through the gene's OMIM settings?
        if (empty($zPat['Patient/Phenotype/Disease'])) {
            // I must provide a phenotype element!
            $zPat['Patient/Phenotype/Disease'] = 'unknown';
        }
        // Multiple diseases are separated by semicolons; each gets its own element.
        $aDiseases = explode(';', $zPat['Patient/Phenotype/Disease']);
        foreach ($aDiseases as $sDisease) {
            print('        <phenotype term="' . $sDisease . '"/>' . "\n");
        }

        // Origin (Patient/Origin/Ethnic, Patient/Origin/Geographic, Patient/Origin/Population).
        $aColumns = array('ethnic' => 'Patient/Origin/Ethnic', 'region' => 'Patient/Origin/Geographic', 'other' => 'Patient/Origin/Population');
        foreach ($aColumns as $sType => $sColumn) {
            if (!empty($zPat[$sColumn])) {
                print('        <population type="' . $sType . '" term="' . $zPat[$sColumn] . '"/>' . "\n");
            }
        }

        // Prepare the closing part of this individual. It's not printed yet;
        // that's done once all variants of this individual have been printed.
        $sIndividualFooter = '';

        // Reference.
        if (!empty($zPat['Patient/Reference'])) {
            // Try to parse PubMed custom link.
            // FIXME; allow for multiple references!
            $aRef = lovd_parsePubMed($zPat['Patient/Reference']);
            if (is_array($aRef)) {
                // Successfully parsed.
                $sIndividualFooter .= '        <db_xref accession="' . $aRef['id'] . '" source="PubMed" uri="' . $aRef['url'] . '">
            <comment term="author">
                <text>' . $aRef['author'] . '</text>
            </comment>
        </db_xref>' . "\n";
            } else {
                // Not a PubMed link; include the reference text as is.
                $sIndividualFooter .= '        <comment term="reference">
            <text>' . $zPat['Patient/Reference'] . '</text>
        </comment>' . "\n";
            }
        }

        // Comment.
        if (!empty($zPat['Patient/Remarks'])) {
            $sIndividualFooter .= '        <comment term="remark">
            <text>' . $zPat['Patient/Remarks'] . '</text>
        </comment>' . "\n";
        }

        // Creation && Modification dates.
        $sIndividualFooter .= '        <creation_date>' . $_FEED->formatDate($zPat['created_date']) . '</creation_date>' . "\n";
        if ($zPat['edited_date']) {
            $sIndividualFooter .= '        <modification_date>' . $_FEED->formatDate($zPat['edited_date']) . '</modification_date>' . "\n";
        }

        $nPrevID = $zPat['patientid'];
    }



    // VARIANT DATA. Query, and loop through the data. Unfortunately no other way possible in LOVD 2.0.
    // Each gene has its own variant table, so this query is run per individual per gene.
    $sSQL = 'SELECT p2v.*, v.* FROM ' . TABLE_PAT2VAR . ' AS p2v INNER JOIN ' . TABLEPREFIX . '_' . $zPat['symbol'] . '_variants AS v USING (variantid) WHERE patientid = "' . $zPat['patientid'] . '" AND p2v.status >= ' . STATUS_MARKED . ' ORDER BY v.sort';
    $qVar = @mysql_query($sSQL);
    while ($zVar = @mysql_fetch_assoc($qVar)) {
        $zVar = array_map('htmlspecialchars', $zVar);
        // Variant type; 'DNA' when the description contains "g.", 'cDNA' when it contains "c.", empty otherwise.
        $sType = (strpos($zVar['Variant/DNA'], 'g.') !== false? 'DNA' : (strpos($zVar['Variant/DNA'], 'c.') !== false? 'cDNA' : ''));
        print('
        <variant id="' . $zVar['variantid'] . '" uri="' . $_CONF['location_url'] . 'variants.php?action=view&amp;view=' . $zPat['patientid'] . '%2C' . $zVar['variantid'] . '%2C' . $zVar['allele'] . '"' . (!$sType? '' : ' type="' . $sType . '"') . '>' . "\n");

        // Gene.
        print($aGenes[$zVar['symbol']]['XML']);

        // Refseq.
        if ($sType == 'DNA' && $aGenes[$zVar['symbol']]['refseq_genomic']) {
            print('            <ref_seq source="genbank" accession="' . $aGenes[$zVar['symbol']]['refseq_genomic'] . '" uri="http://www.ncbi.nlm.nih.gov/nuccore/' . $aGenes[$zVar['symbol']]['refseq_genomic'] . '"/>' . "\n");
        // mRNA will now be the default if variant is not clearly defined.
        // } elseif ($sType == 'cDNA' && $aGenes[$zVar['symbol']]['refseq_mrna']) {
        } elseif ($aGenes[$zVar['symbol']]['refseq_mrna']) {
            print('            <ref_seq source="genbank" accession="' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '" uri="http://www.ncbi.nlm.nih.gov/nuccore/' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '"/>' . "\n");
        }

        // Variant name.
        print('            <name scheme="' . ($sType? 'HGVS' : 'unknown') . '">' . $zVar['Variant/DNA'] . '</name>' . "\n"); // FIXME; Just assuming that this is HGVS, check Mutalyzer fields?

        // Exon.
        if (!empty($zVar['Variant/Exon'])) {
            print('            <exon>' . $zVar['Variant/Exon'] . '</exon>' . "\n");
        }

        // Pathogenicity. The first character of the code is reported as the
        // submitter's classification, the second as the curator's.
        // Square brackets are used for the string offsets; the curly brace
        // syntax is deprecated as of PHP 7.4 and no longer supported as of PHP 8.0.
        print('            <pathogenicity scope="individual" source="submitter" term="' . $aPathogenicities[$zVar['pathogenic'][0]] . '"/>' . "\n" .
              '            <pathogenicity scope="individual" source="curator" term="' . $aPathogenicities[$zVar['pathogenic'][1]] . '"/>' . "\n");

        // Variant detection. The variant's own values are preferred, with the patient's values as fallback.
        $sTemplate  = (!empty($zVar['Variant/Detection/Template'])? $zVar['Variant/Detection/Template'] : (!empty($zPat['Patient/Detection/Template'])? $zPat['Patient/Detection/Template'] : ''));
        $sTechnique = (!empty($zVar['Variant/Detection/Technique'])? $zVar['Variant/Detection/Technique'] : (!empty($zPat['Patient/Detection/Technique'])? $zPat['Patient/Detection/Technique'] : ''));
        if ($sTemplate || $sTechnique) {
            print('            <variant_detection template="' . $sTemplate . '" technique="' . $sTechnique . '"/>' . "\n");
        }

        // Restriction site.
        if (!empty($zVar['Variant/Restriction_site'])) {
            print('            <restriction_site term="' . $zVar['Variant/Restriction_site'] . '"/>' . "\n");
        }

        // Genetic origin (allele).
        // FIXME; 'term' should be (denovo, sporadic, inherited) but we don't always know this.
//        print('            <genetic_origin term="' . $zVar['allele'] . '"/>' . "\n");

        // Frequency.
        if (!empty($zVar['Variant/Frequency'])) {
            print('            <frequency>
                <freq>' . $zVar['Variant/Frequency'] . '</freq>
            </frequency>' . "\n");
        }



        // RNA, Protein.
        // FIXME; This is just because I'm lazy to implement the possibility that RNA *OR* Protein is available, but not both.
        if (!empty($zVar['Variant/RNA']) && !empty($zVar['Variant/Protein'])) {
            print('            <seq_changes>
                <variant type="RNA">' . "\n");
            if ($aGenes[$zVar['symbol']]['refseq_mrna']) {
                print('                    <ref_seq source="genbank" accession="' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '" uri="http://www.ncbi.nlm.nih.gov/nuccore/' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '"/>' . "\n");
            }

            // Variant name.
            print('                    <name scheme="HGVS">' . $zVar['Variant/RNA'] . '</name>' . "\n"); // Of course just assuming that this is HGVS... :S

            // Protein; nested in the RNA variant.
            print('                    <seq_changes>
                        <variant type="AA">' . "\n");
            if ($aGenes[$zVar['symbol']]['id_uniprot']) {
                print('                            <ref_seq source="uniprot" accession="' . $aGenes[$zVar['symbol']]['id_uniprot'] . '" uri="http://www.uniprot.org/uniprot/' . $aGenes[$zVar['symbol']]['id_uniprot'] . '"/>' . "\n");
            } elseif ($aGenes[$zVar['symbol']]['refseq_mrna']) {
                print('                            <ref_seq source="genbank" accession="' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '" uri="http://www.ncbi.nlm.nih.gov/nuccore/' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '"/>' . "\n");
            }

            // Variant name.
            print('                            <name scheme="HGVS">' . $zVar['Variant/Protein'] . '</name>' . "\n"); // Of course just assuming that this is HGVS... :S

            // Close protein, close RNA.
            print('                        </variant>
                    </seq_changes>
                </variant>
            </seq_changes>' . "\n");
        }



        // Location (genomic, if available).
        if (!empty($aGenes[$zVar['symbol']]['chromosome']) && !empty($zVar['g_position_start'])) {
            print('            <location>
                <ref_seq source="genbank" accession="' . $_SETT['human_builds'][$aGenes[$zVar['symbol']]['refseq_build']]['ncbi_sequences'][$aGenes[$zVar['symbol']]['chromosome']] . '" name="' . $_SETT['human_builds'][$aGenes[$zVar['symbol']]['refseq_build']]['ncbi_name'] . '"/>
                <chr>' . $aGenes[$zVar['symbol']]['chromosome'] . '</chr>
                <start>' . $zVar['g_position_start'] . '</start>
                <end>' . $zVar['g_position_end'] . '</end>
            </location>' . "\n");
        } elseif ($aGenes[$zVar['symbol']]['refseq_mrna'] && !empty($zVar['c_position_start'])) {
            // No genomic position; use the position on the mRNA reference
            // sequence. Intronic offsets are appended, with an explicit plus sign for positive offsets.
            // FIXME; use lovd_convertDNAPositionToHR() ???
            print('            <location>
                <ref_seq source="genbank" accession="' . $aGenes[$zVar['symbol']]['refseq_mrna'] . '"/>
                <chr>' . $aGenes[$zVar['symbol']]['chromosome'] . '</chr>
                <start>' . $zVar['c_position_start'] . (empty($zVar['c_position_start_intron'])? '' : ($zVar['c_position_start_intron'] > 0? '+' : '') . $zVar['c_position_start_intron']) . '</start>
                <end>' . $zVar['c_position_end'] . (empty($zVar['c_position_end_intron'])? '' : ($zVar['c_position_end_intron'] > 0? '+' : '') . $zVar['c_position_end_intron']) . '</end>
            </location>' . "\n");
        }

        // Creation && Modification dates.
        print('            <creation_date>' . $_FEED->formatDate($zVar['created_date']) . '</creation_date>' . "\n");
        if ($zVar['edited_date']) {
            print('            <modification_date>' . $_FEED->formatDate($zVar['edited_date']) . '</modification_date>' . "\n");
        }

        // Close variant.
        print('        </variant>' . "\n");
    }
}
if ($n) {
    // Finish and close the last individual.
    print($sIndividualFooter . '    </individual>' . "\n");
}
print('</lsdb>' . "\n");
exit;
?>
