{
  "release": {
    "date": "2026-02-25",
    "id": "233",
    "name": "VOGDB release 233",
    "data_source": "NCBI Refseq release 233",
    "url": "https://fileshare.csb.univie.ac.at/vog/vog233",
    "license": "All data published are licensed under CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)",
    "authors": "Lovro Trgovec-Greif, Hans-Joerg Hellinger, Jean Mainguy, Alexander Pfundner, Dmitrij Frishman, Michael Kiening, Nicole Webster, Patrick Laffy, Thomas Rattei",
    "contact": "Thomas Rattei, Centre for Microbiology and Environmental Systems Science, University of Vienna, Austria, thomas.rattei@univie.ac.at",
    "number_proteins": 712553,
    "number_genomes": 15251
  },
  "groups": [
    {
      "name": "vfam",
      "description": "Virus protein families (built from vogs by HMM-HMM clustering)",
      "number": 39975,
      "summary": "Number of VFAM: 39975 (Virus protein families)"
    },
    {
      "name": "vog",
      "description": "Virus orthologous groups (built from bidirectional sequence similarities)",
      "number": 48870,
      "summary": "Number of VOG: 48870 (Virus orthologous groups)"
    },
    {
      "name": "vfold",
      "description": "Virus protein structural folds (built from vfams by clustering of predicted 3D structures of representative proteins)",
      "number": 33351,
      "summary": "Number of VFOLD: 33351 (Virus protein structural folds)"
    }
  ],
  "files": [
    {
      "name": "vog.raw_algs.alistat.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.raw_algs.alistat.txt",
      "description": "Statistics of multiple alignments according to minimum reporting standard for multiple sequence alignments (https://doi.org/10.1093/nargab/lqaa024).",
      "md5sum": "c9989df9950c7e46bd5665f1364d6e6c",
      "bytes": 3275692,
      "url_label": "vog.raw_algs.alistat.txt (Statistics of multiple alignments): 3,275,692 bytes, MD5 checksum c9989df9950c7e46bd5665f1364d6e6c"
    },
    {
      "name": "vog.annotations.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.annotations.tsv.gz",
      "description": "Tab separated file of groups and their consensus functional annotations (preferrably from Swissprot annotations, if not available then the annotations from RefSeq were used). Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ConsensusFunctionalDescription",
      "md5sum": "9c026f506b4d3c8ae976953f751d8df9",
      "bytes": 372611,
      "url_label": "vog.annotations.tsv.gz (Funcational annotations of groups): 372,611 bytes, MD5 checksum 9c026f506b4d3c8ae976953f751d8df9"
    },
    {
      "name": "vfam.members.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.members.tsv.gz",
      "description": "Tab separated file of VOGs and the comma separated lists of their member protein ids. Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ProteinIDs",
      "md5sum": "8b77ff081d3a6693c4b1beeb985b2eeb",
      "bytes": 4482163,
      "url_label": "vfam.members.tsv.gz (Member protein ids of groups): 4,482,163 bytes, MD5 checksum 8b77ff081d3a6693c4b1beeb985b2eeb"
    },
    {
      "name": "vfam.virusonly.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.virusonly.tsv.gz",
      "description": "Tab separated file of VOGs and their specificic occurrence in virus genomes. For this purpose the homology of all member proteins to cellular genomes from eggNOG 4.5 have been determined with three different stringencies: High stringency: blastp e-Value <=1e-04 and hits in maximal 2 cellular genomes; Medium stringency: blastp e-Value <=1e-10 and hits in maximal 3 cellular genomes; Low stringency: blastp e-Value <=1e-15 and hits in maximal 4 cellular genomes; The column Only_in_viruses has been set true if members matched not more than the maximal number of genomes at the e-Value threshold for each stringency level. Columns: GroupName|Only in viruses (high stringency)|Only in viruses (medium stringency)|Only in viruses (low stringency) 1=True; 0=False. This file is useful to extract virus-specific markers from all VOGs, based on your preferred level of stringency.",
      "md5sum": "9baf698714d25d413a5a57484c6193bb",
      "bytes": 105454,
      "url_label": "vfam.virusonly.tsv.gz (Specificity if groups to Viruses): 105,454 bytes, MD5 checksum 9baf698714d25d413a5a57484c6193bb"
    },
    {
      "name": "vfam.lca.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.lca.tsv.gz",
      "description": "Tab separated file of VOGs and the taxonomic lineage of the last common aencestor (LCA) of member genomes. Genomes with unclassified taxonomic lineages have not been used for LCA determination, which can result in VOG without lca (if all proteins of a VOG are from unclassified lineages). The numbers of genomes per VOG and LCA, as well as the total numbers of genomes in the LCA are given. Columns: GroupName|GenomesInGroupAndLCA|GenomesTotalInLCA|LastCommonAncestor_TaxonName|LastCommonAncestor_TaxonID",
      "md5sum": "9a370a760752debff0cb477885b305f9",
      "bytes": 481829,
      "url_label": "vfam.lca.tsv.gz (Last common aencestors of groups): 481,829 bytes, MD5 checksum 9a370a760752debff0cb477885b305f9"
    },
    {
      "name": "vog.hmm.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.hmm.tar.gz",
      "description": "Compressed archive of the HMMER3 compatible Hidden Markov Models obtained from the multiple sequence alignments for each vogdb group.",
      "md5sum": "ebedf027930d9a08cb42c719a49a91d7",
      "bytes": 567041630,
      "url_label": "vog.hmm.tar.gz (Hidden Markov Models of groups): 567,041,630 bytes, MD5 checksum ebedf027930d9a08cb42c719a49a91d7"
    },
    {
      "name": "vfam.annotations.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.annotations.tsv.gz",
      "description": "Tab separated file of groups and their consensus functional annotations (preferrably from Swissprot annotations, if not available then the annotations from RefSeq were used). Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ConsensusFunctionalDescription",
      "md5sum": "f3ad23daf30b0d2773a321228507155c",
      "bytes": 295486,
      "url_label": "vfam.annotations.tsv.gz (Funcational annotations of groups): 295,486 bytes, MD5 checksum f3ad23daf30b0d2773a321228507155c"
    },
    {
      "name": "vog.faa.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.faa.tar.gz",
      "description": "Compressed archive of FASTA formatted files of the proteins per vogdb group.",
      "md5sum": "d939c2d9492dffc57c654c17300354c1",
      "bytes": 64843583,
      "url_label": "vog.faa.tar.gz (Protein sequences of groups): 64,843,583 bytes, MD5 checksum d939c2d9492dffc57c654c17300354c1"
    },
    {
      "name": "vogdb.host.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vogdb.host.txt",
      "description": "Tab separated file of host information and classification for virus taxa. Columns: taxon id|phage/nonphage|host|superkingdom of host.",
      "md5sum": "33eab4d6c347ae04c57aca27a43d61e1",
      "bytes": 470314,
      "url_label": "vogdb.host.txt (Host information and classification for genomes): 470,314 bytes, MD5 checksum 33eab4d6c347ae04c57aca27a43d61e1"
    },
    {
      "name": "vfold.faa.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfold.faa.tar.gz",
      "description": "Compressed archive of FASTA formatted files of the proteins per vogdb group.",
      "md5sum": "8bc9eac2359e27ebc6978bc8a10faa2b",
      "bytes": 62779721,
      "url_label": "vfold.faa.tar.gz (Protein sequences of groups): 62,779,721 bytes, MD5 checksum 8bc9eac2359e27ebc6978bc8a10faa2b"
    },
    {
      "name": "vfam.raw_algs.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.raw_algs.tar.gz",
      "description": "Compressed archive of multiple sequence alignments for each VOGDB group.",
      "md5sum": "d41b56cbaed24e4b1968b17c57f676e7",
      "bytes": 61408483,
      "url_label": "vfam.raw_algs.tar.gz (Multiple sequence alignments of groups): 61,408,483 bytes, MD5 checksum d41b56cbaed24e4b1968b17c57f676e7"
    },
    {
      "name": "vog.members.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.members.tsv.gz",
      "description": "Tab separated file of VOGs and the comma separated lists of their member protein ids. Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ProteinIDs",
      "md5sum": "b861867c08f48e278defb44b476f460f",
      "bytes": 4567453,
      "url_label": "vog.members.tsv.gz (Member protein ids of groups): 4,567,453 bytes, MD5 checksum b861867c08f48e278defb44b476f460f"
    },
    {
      "name": "vfold.members.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfold.members.tsv.gz",
      "description": "Tab separated file of VOGs and the comma separated lists of their member protein ids. Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ProteinIDs",
      "md5sum": "913eba92b5303d7d614208a4467c50cd",
      "bytes": 4441152,
      "url_label": "vfold.members.tsv.gz (Member protein ids of groups): 4,441,152 bytes, MD5 checksum 913eba92b5303d7d614208a4467c50cd"
    },
    {
      "name": "vfam.hmm.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.hmm.tar.gz",
      "description": "Compressed archive of the HMMER3 compatible Hidden Markov Models obtained from the multiple sequence alignments for each vogdb group.",
      "md5sum": "8ebb9d6eb001c35383bbd9c5a7fc668d",
      "bytes": 461508597,
      "url_label": "vfam.hmm.tar.gz (Hidden Markov Models of groups): 461,508,597 bytes, MD5 checksum 8ebb9d6eb001c35383bbd9c5a7fc668d"
    },
    {
      "name": "vfold.annotations.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfold.annotations.tsv.gz",
      "description": "Tab separated file of groups and their consensus functional annotations (preferrably from Swissprot annotations, if not available then the annotations from RefSeq were used). Columns: GroupName|ProteinCount|SpeciesCount|FunctionalCategory|ConsensusFunctionalDescription",
      "md5sum": "f2d411483673dc3bc6624962c83291ed",
      "bytes": 244844,
      "url_label": "vfold.annotations.tsv.gz (Funcational annotations of groups): 244,844 bytes, MD5 checksum f2d411483673dc3bc6624962c83291ed"
    },
    {
      "name": "vfam.representatives.colabfold_predictions.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.representatives.colabfold_predictions.tar.gz",
      "description": "Colabfold predictions of protein structures, represented by best ranked PDB and score files.",
      "md5sum": "6e58c3e365bfc23f112d36487525a94a",
      "bytes": 7751723340,
      "url_label": "vfam.representatives.colabfold_predictions.tar.gz (Protein structure predictions): 7,751,723,340 bytes, MD5 checksum 6e58c3e365bfc23f112d36487525a94a"
    },
    {
      "name": "vogdb.proteins.all.fa.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vogdb.proteins.all.fa.gz",
      "description": "FASTA formatted file of all proteins from the genomes in vog.species.list. Protein IDs encode the taxonomy id of the genome and the RefSeq protein id. For peptides from polyproteins also the corresponding protein id of the polyprotein (CDS) is given.",
      "md5sum": "9eafc5d43d1baa07d5177a7ffa7b17c0",
      "bytes": 105515219,
      "url_label": "vogdb.proteins.all.fa.gz (Protein sequences of all genomes): 105,515,219 bytes, MD5 checksum 9eafc5d43d1baa07d5177a7ffa7b17c0"
    },
    {
      "name": "vogdb.genes.all.fa.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vogdb.genes.all.fa.gz",
      "description": "FASTA formatted file of all gene sequences from the genomes in vog.species.list. Same IDs as in the protein file are used. For polyprotein genes the partial gene sequences of the peptides as well as the complete gene sequences of the polyprotein are contained.",
      "md5sum": "3ab17122f5a2af8638d6c7c1c9f44703",
      "bytes": 169745636,
      "url_label": "vogdb.genes.all.fa.gz (Gene sequences of all genomes): 169,745,636 bytes, MD5 checksum 3ab17122f5a2af8638d6c7c1c9f44703"
    },
    {
      "name": "vfam.raw_algs.alistat.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.raw_algs.alistat.txt",
      "description": "Statistics of multiple alignments according to minimum reporting standard for multiple sequence alignments (https://doi.org/10.1093/nargab/lqaa024).",
      "md5sum": "9eb797738ebf2ab33971c3f426bf9d16",
      "bytes": 2726773,
      "url_label": "vfam.raw_algs.alistat.txt (Statistics of multiple alignments): 2,726,773 bytes, MD5 checksum 9eb797738ebf2ab33971c3f426bf9d16"
    },
    {
      "name": "vfam.representatives.colabfold_mean_plddt.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.representatives.colabfold_mean_plddt.txt",
      "description": "Mean pLDDT values for colabfold predictions of protein structures.",
      "md5sum": "5ce051d7d9137ac4d19e3479017a8293",
      "bytes": 638576,
      "url_label": "vfam.representatives.colabfold_mean_plddt.txt (Mean pLDDT values of protein structure predictions): 638,576 bytes, MD5 checksum 5ce051d7d9137ac4d19e3479017a8293"
    },
    {
      "name": "vog.virusonly.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.virusonly.tsv.gz",
      "description": "Tab separated file of VOGs and their specificic occurrence in virus genomes. For this purpose the homology of all member proteins to cellular genomes from eggNOG 4.5 have been determined with three different stringencies: High stringency: blastp e-Value <=1e-04 and hits in maximal 2 cellular genomes; Medium stringency: blastp e-Value <=1e-10 and hits in maximal 3 cellular genomes; Low stringency: blastp e-Value <=1e-15 and hits in maximal 4 cellular genomes; The column Only_in_viruses has been set true if members matched not more than the maximal number of genomes at the e-Value threshold for each stringency level. Columns: GroupName|Only in viruses (high stringency)|Only in viruses (medium stringency)|Only in viruses (low stringency) 1=True; 0=False. This file is useful to extract virus-specific markers from all VOGs, based on your preferred level of stringency.",
      "md5sum": "3f53ff04da822cd0824dc6049e8c4127",
      "bytes": 126917,
      "url_label": "vog.virusonly.tsv.gz (Specificity if groups to Viruses): 126,917 bytes, MD5 checksum 3f53ff04da822cd0824dc6049e8c4127"
    },
    {
      "name": "vog.raw_algs.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.raw_algs.tar.gz",
      "description": "Compressed archive of multiple sequence alignments for each VOGDB group.",
      "md5sum": "050217b0bd5737b648eb80f3fc041b3f",
      "bytes": 60389774,
      "url_label": "vog.raw_algs.tar.gz (Multiple sequence alignments of groups): 60,389,774 bytes, MD5 checksum 050217b0bd5737b648eb80f3fc041b3f"
    },
    {
      "name": "vfold.lca.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfold.lca.tsv.gz",
      "description": "Tab separated file of VOGs and the taxonomic lineage of the last common aencestor (LCA) of member genomes. Genomes with unclassified taxonomic lineages have not been used for LCA determination, which can result in VOG without lca (if all proteins of a VOG are from unclassified lineages). The numbers of genomes per VOG and LCA, as well as the total numbers of genomes in the LCA are given. Columns: GroupName|GenomesInGroupAndLCA|GenomesTotalInLCA|LastCommonAncestor_TaxonName|LastCommonAncestor_TaxonID",
      "md5sum": "237e5c59b00b2ad321d6f70bd457d7fe",
      "bytes": 399308,
      "url_label": "vfold.lca.tsv.gz (Last common aencestors of groups): 399,308 bytes, MD5 checksum 237e5c59b00b2ad321d6f70bd457d7fe"
    },
    {
      "name": "vogdb.functional_categories.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vogdb.functional_categories.txt",
      "description": "Text file listing the lettercodes of functional categories. These consist of X (unused in NCBI COG functional categories), followed by a lower case character indicating the functional category.",
      "md5sum": "6b816cc49c17d0095da91bad4e7552fa",
      "bytes": 308,
      "url_label": "vogdb.functional_categories.txt (Lettercodes of functional categories): 308 bytes, MD5 checksum 6b816cc49c17d0095da91bad4e7552fa"
    },
    {
      "name": "vfold.virusonly.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfold.virusonly.tsv.gz",
      "description": "Tab separated file of VOGs and their specificic occurrence in virus genomes. For this purpose the homology of all member proteins to cellular genomes from eggNOG 4.5 have been determined with three different stringencies: High stringency: blastp e-Value <=1e-04 and hits in maximal 2 cellular genomes; Medium stringency: blastp e-Value <=1e-10 and hits in maximal 3 cellular genomes; Low stringency: blastp e-Value <=1e-15 and hits in maximal 4 cellular genomes; The column Only_in_viruses has been set true if members matched not more than the maximal number of genomes at the e-Value threshold for each stringency level. Columns: GroupName|Only in viruses (high stringency)|Only in viruses (medium stringency)|Only in viruses (low stringency) 1=True; 0=False. This file is useful to extract virus-specific markers from all VOGs, based on your preferred level of stringency.",
      "md5sum": "6078f9be67febcc34b4a0fa2c1c41ac8",
      "bytes": 87310,
      "url_label": "vfold.virusonly.tsv.gz (Specificity if groups to Viruses): 87,310 bytes, MD5 checksum 6078f9be67febcc34b4a0fa2c1c41ac8"
    },
    {
      "name": "vog.lca.tsv.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vog.lca.tsv.gz",
      "description": "Tab separated file of VOGs and the taxonomic lineage of the last common aencestor (LCA) of member genomes. Genomes with unclassified taxonomic lineages have not been used for LCA determination, which can result in VOG without lca (if all proteins of a VOG are from unclassified lineages). The numbers of genomes per VOG and LCA, as well as the total numbers of genomes in the LCA are given. Columns: GroupName|GenomesInGroupAndLCA|GenomesTotalInLCA|LastCommonAncestor_TaxonName|LastCommonAncestor_TaxonID",
      "md5sum": "5ae145fdc6ac76ecbc1febe1633a6006",
      "bytes": 600254,
      "url_label": "vog.lca.tsv.gz (Last common aencestors of groups): 600,254 bytes, MD5 checksum 5ae145fdc6ac76ecbc1febe1633a6006"
    },
    {
      "name": "vogdb.species.txt",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vogdb.species.txt",
      "description": "Tab separated file of virus genomes used for VOG construction. Columns: species name|taxon id|source|source version",
      "md5sum": "fed22b344786c2cda07be40e343752a0",
      "bytes": 792481,
      "url_label": "vogdb.species.txt (Virus genomes used for VOG construction): 792,481 bytes, MD5 checksum fed22b344786c2cda07be40e343752a0"
    },
    {
      "name": "vogdb.taxonomy.krona.html",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vogdb.taxonomy.krona.html",
      "description": "Interactive chart of virus genome taxonomies.",
      "md5sum": "b6c27cae5b65545b14ead93cc63581e3",
      "bytes": 6776506,
      "url_label": "vogdb.taxonomy.krona.html (Distribution of virus genome taxonomies): 6,776,506 bytes, MD5 checksum b6c27cae5b65545b14ead93cc63581e3"
    },
    {
      "name": "vfam.faa.tar.gz",
      "url": "https://fileshare.csb.univie.ac.at/vog/vog233/vfam.faa.tar.gz",
      "description": "Compressed archive of FASTA formatted files of the proteins per vogdb group.",
      "md5sum": "7a30bbf6eca8741c671ca4352e300279",
      "bytes": 64042065,
      "url_label": "vfam.faa.tar.gz (Protein sequences of groups): 64,042,065 bytes, MD5 checksum 7a30bbf6eca8741c671ca4352e300279"
    }
  ]
}
