{
  "release": {
    "date": "2026-05-28",
    "id": "235",
    "name": "VOGDB release 235",
    "data_source": "NCBI Refseq release 235",
    "url": "https://fileshare.csb.univie.ac.at/vog/vog235",
    "license": "All data published are licensed under CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)",
    "authors": "Lovro Trgovec-Greif, Hans-Joerg Hellinger, Jean Mainguy, Alexander Pfundner, Dmitrij Frishman, Michael Kiening, Nicole Webster, Patrick Laffy, Thomas Rattei",
    "contact": "Thomas Rattei, Centre for Microbiology and Environmental Systems Science, University of Vienna, Austria, thomas.rattei@univie.ac.at",
    "number_proteins": 719983,
    "number_genomes": 15646
  },
  "groups": [
    {
      "name": "vfam",
      "description": "Virus protein families (built from vogs by HMM-HMM clustering)",
      "number": 39808,
      "summary": "Number of VFAM: 39808 (Virus protein families)"
    },
    {
      "name": "vog",
      "description": "Virus orthologous groups (built from bidirectional sequence similarities)",
      "number": 48646,
      "summary": "Number of VOG: 48646 (Virus orthologous groups)"
    },
    {
      "name": "vfold",
      "description": "Virus protein structural folds (built from vfams by clustering of predicted 3D structures of representative proteins)",
      "number": 33168,
      "summary": "Number of VFOLD: 33168 (Virus protein structural folds)"
    }
  ],
  "files": [
    {
      "name": "vog.raw_algs.alistat.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.raw_algs.alistat.txt",
      "description": "Statistics of multiple alignments according to minimum reporting standard for multiple sequence alignments (https://doi.org/10.1093/nargab/lqaa024).",
      "md5sum": "85aadc23a1fc9d4450b1d6ed4dd155dd",
      "bytes": 3271162,
      "url_label": "vog.raw_algs.alistat.txt (Statistics of multiple alignments): 3,271,162 bytes, MD5 checksum 85aadc23a1fc9d4450b1d6ed4dd155dd"
    },
    {
      "name": "vog.annotations.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.annotations.tsv.gz",
      "description": "Tab separated file of groups and their consensus functional annotations (preferably from Swissprot annotations, if not available then the annotations from RefSeq were used). Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ConsensusFunctionalDescription",
      "md5sum": "1f6481d5b4fa82ec716caa142a638190",
      "bytes": 371457,
      "url_label": "vog.annotations.tsv.gz (Functional annotations of groups): 371,457 bytes, MD5 checksum 1f6481d5b4fa82ec716caa142a638190"
    },
    {
      "name": "vfam.members.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.members.tsv.gz",
      "description": "Tab separated file of VOGs and the comma separated lists of their member protein ids. Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ProteinIDs",
      "md5sum": "9c65bd0b20f3782c5e56659d1df839e8",
      "bytes": 4550166,
      "url_label": "vfam.members.tsv.gz (Member protein ids of groups): 4,550,166 bytes, MD5 checksum 9c65bd0b20f3782c5e56659d1df839e8"
    },
    {
      "name": "vfam.virusonly.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.virusonly.tsv.gz",
      "description": "Tab separated file of VOGs and their specific occurrence in virus genomes. For this purpose the homology of all member proteins to cellular genomes from eggNOG 4.5 has been determined with three different stringencies: High stringency: blastp e-Value <=1e-04 and hits in maximal 2 cellular genomes; Medium stringency: blastp e-Value <=1e-10 and hits in maximal 3 cellular genomes; Low stringency: blastp e-Value <=1e-15 and hits in maximal 4 cellular genomes; The column Only_in_viruses has been set true if members matched not more than the maximal number of genomes at the e-Value threshold for each stringency level. Columns: GroupName|Only in viruses (high stringency)|Only in viruses (medium stringency)|Only in viruses (low stringency) 1=True; 0=False. This file is useful to extract virus-specific markers from all VOGs, based on your preferred level of stringency.",
      "md5sum": "5fa26182051112b06a931bae3aae2df1",
      "bytes": 105200,
      "url_label": "vfam.virusonly.tsv.gz (Specificity of groups to Viruses): 105,200 bytes, MD5 checksum 5fa26182051112b06a931bae3aae2df1"
    },
    {
      "name": "vfam.lca.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.lca.tsv.gz",
      "description": "Tab separated file of VOGs and the taxonomic lineage of the last common ancestor (LCA) of member genomes. Genomes with unclassified taxonomic lineages have not been used for LCA determination, which can result in VOG without lca (if all proteins of a VOG are from unclassified lineages). The numbers of genomes per VOG and LCA, as well as the total numbers of genomes in the LCA are given. Columns: GroupName|GenomesInGroupAndLCA|GenomesTotalInLCA|LastCommonAncestor_TaxonName|LastCommonAncestor_TaxonID",
      "md5sum": "b78878d70feb20e79a3069fc5c830d92",
      "bytes": 497395,
      "url_label": "vfam.lca.tsv.gz (Last common ancestors of groups): 497,395 bytes, MD5 checksum b78878d70feb20e79a3069fc5c830d92"
    },
    {
      "name": "vog.hmm.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.hmm.tar.gz",
      "description": "Compressed archive of the HMMER3 compatible Hidden Markov Models obtained from the multiple sequence alignments for each vogdb group.",
      "md5sum": "ef92a4c880126ea3b86e52fdf902ba06",
      "bytes": 572994391,
      "url_label": "vog.hmm.tar.gz (Hidden Markov Models of groups): 572,994,391 bytes, MD5 checksum ef92a4c880126ea3b86e52fdf902ba06"
    },
    {
      "name": "vfam.annotations.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.annotations.tsv.gz",
      "description": "Tab separated file of groups and their consensus functional annotations (preferably from Swissprot annotations, if not available then the annotations from RefSeq were used). Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ConsensusFunctionalDescription",
      "md5sum": "20c8ffdc7a754f5ae224516faf84bd12",
      "bytes": 294137,
      "url_label": "vfam.annotations.tsv.gz (Functional annotations of groups): 294,137 bytes, MD5 checksum 20c8ffdc7a754f5ae224516faf84bd12"
    },
    {
      "name": "vog.faa.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.faa.tar.gz",
      "description": "Compressed archive of FASTA formatted files of the proteins per vogdb group.",
      "md5sum": "f40d8a1ac3b02c8b14a8b70c7d910d47",
      "bytes": 66220480,
      "url_label": "vog.faa.tar.gz (Protein sequences of groups): 66,220,480 bytes, MD5 checksum f40d8a1ac3b02c8b14a8b70c7d910d47"
    },
    {
      "name": "vogdb.host.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vogdb.host.txt",
      "description": "Tab separated file of host information and classification for virus taxa. Columns: taxon id|phage/nonphage|host|superkingdom of host.",
      "md5sum": "8db6ce183779d105ce78acdfc0f2652a",
      "bytes": 482633,
      "url_label": "vogdb.host.txt (Host information and classification for genomes): 482,633 bytes, MD5 checksum 8db6ce183779d105ce78acdfc0f2652a"
    },
    {
      "name": "vfold.faa.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfold.faa.tar.gz",
      "description": "Compressed archive of FASTA formatted files of the proteins per vogdb group.",
      "md5sum": "770348ee822b1e166f9c5f1b79d40130",
      "bytes": 64110540,
      "url_label": "vfold.faa.tar.gz (Protein sequences of groups): 64,110,540 bytes, MD5 checksum 770348ee822b1e166f9c5f1b79d40130"
    },
    {
      "name": "vfam.raw_algs.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.raw_algs.tar.gz",
      "description": "Compressed archive of multiple sequence alignments for each VOGDB group.",
      "md5sum": "713cdb47d5428f9b1c0984bb7a8c8ddf",
      "bytes": 62847087,
      "url_label": "vfam.raw_algs.tar.gz (Multiple sequence alignments of groups): 62,847,087 bytes, MD5 checksum 713cdb47d5428f9b1c0984bb7a8c8ddf"
    },
    {
      "name": "vog.members.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.members.tsv.gz",
      "description": "Tab separated file of VOGs and the comma separated lists of their member protein ids. Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ProteinIDs",
      "md5sum": "6e4789ecd517f7f0f6a4e8433486e1e2",
      "bytes": 4631415,
      "url_label": "vog.members.tsv.gz (Member protein ids of groups): 4,631,415 bytes, MD5 checksum 6e4789ecd517f7f0f6a4e8433486e1e2"
    },
    {
      "name": "vfold.members.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfold.members.tsv.gz",
      "description": "Tab separated file of VOGs and the comma separated lists of their member protein ids. Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ProteinIDs",
      "md5sum": "432de22ded6b124bafa74fab883d7ccf",
      "bytes": 4515230,
      "url_label": "vfold.members.tsv.gz (Member protein ids of groups): 4,515,230 bytes, MD5 checksum 432de22ded6b124bafa74fab883d7ccf"
    },
    {
      "name": "vfam.hmm.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.hmm.tar.gz",
      "description": "Compressed archive of the HMMER3 compatible Hidden Markov Models obtained from the multiple sequence alignments for each vogdb group.",
      "md5sum": "ed7e43c877c1f6201f8c5fa4f51698e7",
      "bytes": 463359936,
      "url_label": "vfam.hmm.tar.gz (Hidden Markov Models of groups): 463,359,936 bytes, MD5 checksum ed7e43c877c1f6201f8c5fa4f51698e7"
    },
    {
      "name": "vfold.annotations.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfold.annotations.tsv.gz",
      "description": "Tab separated file of groups and their consensus functional annotations (preferably from Swissprot annotations, if not available then the annotations from RefSeq were used). Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ConsensusFunctionalDescription",
      "md5sum": "1a8f3c89470a2b59172432682ce6db58",
      "bytes": 243224,
      "url_label": "vfold.annotations.tsv.gz (Functional annotations of groups): 243,224 bytes, MD5 checksum 1a8f3c89470a2b59172432682ce6db58"
    },
    {
      "name": "vfam.representatives.colabfold_predictions.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.representatives.colabfold_predictions.tar.gz",
      "description": "Colabfold predictions of protein structures, represented by best ranked PDB and score files.",
      "md5sum": "44d9e30f77f6062e43fa5f19c6ec16f3",
      "bytes": 7834407189,
      "url_label": "vfam.representatives.colabfold_predictions.tar.gz (Protein structure predictions): 7,834,407,189 bytes, MD5 checksum 44d9e30f77f6062e43fa5f19c6ec16f3"
    },
    {
      "name": "vogdb.proteins.all.fa.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vogdb.proteins.all.fa.gz",
      "description": "FASTA formatted file of all proteins from the genomes in vogdb.species.txt. Protein IDs encode the taxonomy id of the genome and the RefSeq protein id. For peptides from polyproteins also the corresponding protein id of the polyprotein (CDS) is given.",
      "md5sum": "cbe52bc431a17f3151a4be425c825a0d",
      "bytes": 106448839,
      "url_label": "vogdb.proteins.all.fa.gz (Protein sequences of all genomes): 106,448,839 bytes, MD5 checksum cbe52bc431a17f3151a4be425c825a0d"
    },
    {
      "name": "vogdb.genes.all.fa.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vogdb.genes.all.fa.gz",
      "description": "FASTA formatted file of all gene sequences from the genomes in vogdb.species.txt. Same IDs as in the protein file are used. For polyprotein genes the partial gene sequences of the peptides as well as the complete gene sequences of the polyprotein are contained.",
      "md5sum": "80dbfedab373421dc51025e28f6f046e",
      "bytes": 171387463,
      "url_label": "vogdb.genes.all.fa.gz (Gene sequences of all genomes): 171,387,463 bytes, MD5 checksum 80dbfedab373421dc51025e28f6f046e"
    },
    {
      "name": "vfam.raw_algs.alistat.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.raw_algs.alistat.txt",
      "description": "Statistics of multiple alignments according to minimum reporting standard for multiple sequence alignments (https://doi.org/10.1093/nargab/lqaa024).",
      "md5sum": "1d95525e583d7f5f0fe9a469c970ed49",
      "bytes": 2720802,
      "url_label": "vfam.raw_algs.alistat.txt (Statistics of multiple alignments): 2,720,802 bytes, MD5 checksum 1d95525e583d7f5f0fe9a469c970ed49"
    },
    {
      "name": "vfam.representatives.colabfold_mean_plddt.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.representatives.colabfold_mean_plddt.txt",
      "description": "Mean pLDDT values for colabfold predictions of protein structures.",
      "md5sum": "2459f9050db3ac3496c5ea0913c3a494",
      "bytes": 635888,
      "url_label": "vfam.representatives.colabfold_mean_plddt.txt (Mean pLDDT values of protein structure predictions): 635,888 bytes, MD5 checksum 2459f9050db3ac3496c5ea0913c3a494"
    },
    {
      "name": "vog.virusonly.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.virusonly.tsv.gz",
      "description": "Tab separated file of VOGs and their specific occurrence in virus genomes. For this purpose the homology of all member proteins to cellular genomes from eggNOG 4.5 has been determined with three different stringencies: High stringency: blastp e-Value <=1e-04 and hits in maximal 2 cellular genomes; Medium stringency: blastp e-Value <=1e-10 and hits in maximal 3 cellular genomes; Low stringency: blastp e-Value <=1e-15 and hits in maximal 4 cellular genomes; The column Only_in_viruses has been set true if members matched not more than the maximal number of genomes at the e-Value threshold for each stringency level. Columns: GroupName|Only in viruses (high stringency)|Only in viruses (medium stringency)|Only in viruses (low stringency) 1=True; 0=False. This file is useful to extract virus-specific markers from all VOGs, based on your preferred level of stringency.",
      "md5sum": "6976c520c9b0ebaccaad75e2fe5d95df",
      "bytes": 126237,
      "url_label": "vog.virusonly.tsv.gz (Specificity of groups to Viruses): 126,237 bytes, MD5 checksum 6976c520c9b0ebaccaad75e2fe5d95df"
    },
    {
      "name": "vog.raw_algs.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.raw_algs.tar.gz",
      "description": "Compressed archive of multiple sequence alignments for each VOGDB group.",
      "md5sum": "676e286a49d6b1716cd3a0f6e1d1c2d9",
      "bytes": 61847213,
      "url_label": "vog.raw_algs.tar.gz (Multiple sequence alignments of groups): 61,847,213 bytes, MD5 checksum 676e286a49d6b1716cd3a0f6e1d1c2d9"
    },
    {
      "name": "vfold.lca.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfold.lca.tsv.gz",
      "description": "Tab separated file of VOGs and the taxonomic lineage of the last common ancestor (LCA) of member genomes. Genomes with unclassified taxonomic lineages have not been used for LCA determination, which can result in VOG without lca (if all proteins of a VOG are from unclassified lineages). The numbers of genomes per VOG and LCA, as well as the total numbers of genomes in the LCA are given. Columns: GroupName|GenomesInGroupAndLCA|GenomesTotalInLCA|LastCommonAncestor_TaxonName|LastCommonAncestor_TaxonID",
      "md5sum": "bc051ddfd1f80ed8d99bafddcde797e9",
      "bytes": 412625,
      "url_label": "vfold.lca.tsv.gz (Last common ancestors of groups): 412,625 bytes, MD5 checksum bc051ddfd1f80ed8d99bafddcde797e9"
    },
    {
      "name": "vogdb.functional_categories.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vogdb.functional_categories.txt",
      "description": "Text file listing the lettercodes of functional categories. These consist of X (unused in NCBI COG functional categories), followed by a lower case character indicating the functional category.",
      "md5sum": "6b816cc49c17d0095da91bad4e7552fa",
      "bytes": 308,
      "url_label": "vogdb.functional_categories.txt (Lettercodes of functional categories): 308 bytes, MD5 checksum 6b816cc49c17d0095da91bad4e7552fa"
    },
    {
      "name": "vfold.virusonly.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfold.virusonly.tsv.gz",
      "description": "Tab separated file of VOGs and their specific occurrence in virus genomes. For this purpose the homology of all member proteins to cellular genomes from eggNOG 4.5 has been determined with three different stringencies: High stringency: blastp e-Value <=1e-04 and hits in maximal 2 cellular genomes; Medium stringency: blastp e-Value <=1e-10 and hits in maximal 3 cellular genomes; Low stringency: blastp e-Value <=1e-15 and hits in maximal 4 cellular genomes; The column Only_in_viruses has been set true if members matched not more than the maximal number of genomes at the e-Value threshold for each stringency level. Columns: GroupName|Only in viruses (high stringency)|Only in viruses (medium stringency)|Only in viruses (low stringency) 1=True; 0=False. This file is useful to extract virus-specific markers from all VOGs, based on your preferred level of stringency.",
      "md5sum": "784cd9b82e399a2f55cf9c20d2ebb058",
      "bytes": 86919,
      "url_label": "vfold.virusonly.tsv.gz (Specificity of groups to Viruses): 86,919 bytes, MD5 checksum 784cd9b82e399a2f55cf9c20d2ebb058"
    },
    {
      "name": "vog.lca.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vog.lca.tsv.gz",
      "description": "Tab separated file of VOGs and the taxonomic lineage of the last common ancestor (LCA) of member genomes. Genomes with unclassified taxonomic lineages have not been used for LCA determination, which can result in VOG without lca (if all proteins of a VOG are from unclassified lineages). The numbers of genomes per VOG and LCA, as well as the total numbers of genomes in the LCA are given. Columns: GroupName|GenomesInGroupAndLCA|GenomesTotalInLCA|LastCommonAncestor_TaxonName|LastCommonAncestor_TaxonID",
      "md5sum": "3a785ed8e7f61ea264b00326e5998f0b",
      "bytes": 615054,
      "url_label": "vog.lca.tsv.gz (Last common ancestors of groups): 615,054 bytes, MD5 checksum 3a785ed8e7f61ea264b00326e5998f0b"
    },
    {
      "name": "vogdb.species.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vogdb.species.txt",
      "description": "Tab separated file of virus genomes used for VOG construction. Columns: species name|taxon id|source|source version",
      "md5sum": "62f5793436c6f263b9827e76f00faa2e",
      "bytes": 815682,
      "url_label": "vogdb.species.txt (Virus genomes used for VOG construction): 815,682 bytes, MD5 checksum 62f5793436c6f263b9827e76f00faa2e"
    },
    {
      "name": "vogdb.taxonomy.krona.html",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vogdb.taxonomy.krona.html",
      "description": "Interactive chart of virus genome taxonomies.",
      "md5sum": "125eb083dc816556c945964907cecc93",
      "bytes": 6933398,
      "url_label": "vogdb.taxonomy.krona.html (Distribution of virus genome taxonomies): 6,933,398 bytes, MD5 checksum 125eb083dc816556c945964907cecc93"
    },
    {
      "name": "vfam.faa.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog235/vfam.faa.tar.gz",
      "description": "Compressed archive of FASTA formatted files of the proteins per vogdb group.",
      "md5sum": "45195e9f8a037f50ea7770bedfcb5c54",
      "bytes": 65384759,
      "url_label": "vfam.faa.tar.gz (Protein sequences of groups): 65,384,759 bytes, MD5 checksum 45195e9f8a037f50ea7770bedfcb5c54"
    }
  ]
}
