Abstract:
The FASTA and GTF/GFF format files should be prepared when genome
mapping using the aligner.
Of
those files with genome annotation downloaded from NCBI or Esemble
public database, the FASTA and GTF/GFF format files should be
prepared when genome mapping using the aligner, namely TopHat. The
FASTA file containing chromosome sequences is used by Bowtie. With
GTF/GFF file options, TopHat will first extract the transcript
sequences and use Bowtie to align reads to those virtual
transcriptome first. So, the annotations between FASTA and GFF/GTF
files should be matched to each other. The
values in the first column of the GTF/GFF file must match the name of
the reference sequence in FASTA file.
There
are some files available for some common species at
http://ccb.jhu.edu/software/tophat/igenomes.shtml.
For other species without listed in the site or the involvement of
the last annotation released, you have to make these files by
yourself. Here, I provide some Perl code.
use strict;
use warnings;
use Bio::SeqIO;
use Bio::Seq::Quality;
use Bio::Perl;
use List::MoreUtils;
use List::Util;
use File::Find;
#subroutines
########################
#
sub get_fasta{
my ($genome_dir, $chr_pointer, $out_fasta_file)=@_;
my @chr=@$chr_pointer;
open my($FA), ">", $out_fasta_file or die;
foreach my $chr(@chr){
my
$fasta_file=$genome_dir.'Homo_sapiens.GRCh38.dna.chromosome.'.$chr.'.fa';
print "$fasta_file\n";
my $in_obj = Bio::SeqIO->new(-file =>$fasta_file, -format =>
'fasta');
while (my $seq_obj = $in_obj->next_seq() ) {#3
my $seq=$seq_obj->seq();
print $FA ">$chr\n", "$seq\n";
print ">$chr\n";
}
}
close($FA);
}
##################################
sub get_NCBI_gff{
my ($genome_dir, $chr_pointer, $chr_acc_pointer, $out_gff_file)=@_;
my @chr=@$chr_pointer;
my %chr_acc=%$chr_acc_pointer;
my $gff_file=$genome_dir.'ref_GRCh38_top_level.gff3';
print "$gff_file\n";
open my($GFF), ">", $out_gff_file or die;
foreach my $chr_num(@chr){#2
my $acc=$chr_acc{$chr_num};
print "$chr_num\n";
open my($IN), "<", $gff_file or die;
while (my $line=<$IN>) {#3
if($line=~/^$acc/){
$line=~s/^$acc/$chr_num/;
$line=~s/,/;/g;
$line=~s/:/=/g;
$line=~s/GeneID/gene_id/g;
$line=~s/Dbxref=//g;
print $GFF $line;
}
}#3
close($IN);
}#2
close($GFF);
}
sub get_ensembl_gtf{
my ($gtf_file, $chr_pointer, $out_gtf_file)=@_;
my @chr=@$chr_pointer;
print "$gtf_file\n";
open my($GFF), ">", $out_gtf_file or die;
foreach my $chr(@chr){#2
print "$chr\n";
open my($IN), "<", $gtf_file or die;
while (my $line=<$IN>) {#3
if($line=~/^$chr\t/){
print $GFF $line;
}
}#3
close($IN);
}#2
close($GFF);
}
############
#main programm
##############
#get genome fasta filr
my @chr=qw(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X
Y MT);
my
$genome_dir='/home/yuan/data_preparation/ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/tmp/';
my $out_fasta_file="/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38.fa";
#subroutine
get_fasta($genome_dir, \@chr, $out_fasta_file);
#
my %chr_acc=( '1'=>'NC_000001.11', '2'=>'NC_000002.12',
'3'=>'NC_000003.12', '4'=>'NC_000004.12', '5'=>'NC_000005.10',
'6'=>'NC_000006.12', '7'=>'NC_000007.14',
'8'=>'NC_000008.11', '9'=>'NC_000009.12', '10'=>'NC_000010.11',
'11'=>'NC_000011.10', '12'=>'NC_000012.12',
'13'=>'NC_000013.11', '14'=>'NC_000014.9',
'15'=>'NC_000015.10',
'16'=>'NC_000016.10', '17'=>'NC_000017.11',
'18'=>'NC_000018.10', '19'=>'NC_000019.10',
'20'=>'NC_000020.11',
'21'=>'NC_000021.9', '22'=>'NC_000022.11',
'X'=>'NC_000023.11', 'Y'=>'NC_000024.10', 'MT'=>'NC_012920.1',
);
my $out_gff_file="/home/yuan/eRNA/ref_seq/human_ref_GRCh38.gff";
#subroutine
get_NCBI_gff($genome_dir, \@chr, \%chr_acc, $out_gff_file);
my
$gtf_file='/home/yuan/data_preparation/ftp.ensembl.org/pub/current_gtf/homo_sapiens/Homo_sapiens.GRCh38.76.gtf';
my $out_gtf_file="/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38.gtf";
get_ensembl_gtf($gtf_file, \@chr, $out_gtf_file);
Writing
date: 2012.10.10, 2015.02.20