Abstract:
Counting how many read mapped to the genome features (gene,
transcript, or exon) is a common task for mRNA-seq analysis.
mRNA sequencing
(mRNA-seq ) is widely adopted for the profilings of transcriptome.
Quantitation of expression level by mRNA-seq is count-based. More
read counted for a given feature indicate a higher level of
expression. Some R packages including edgeR and DESeq only address
read counts for differential expression analysis.
Counting reads is
always against the annotated features of genome. One difficult thing
is the relationship between the mapped read and the feature is not
always one on one. Figure 1 below showed multiple patterns
(http://www-huber.embl.de/users/anders/HTSeq/doc/count.html).
Figure
1. Rules of counting reads
The Python software
known as htseq-count can count reads based the mapping results in SAM
format. I present another computational Perl software I programmed
for counting reads at the end of this paper. Here is the comparison
of read counts determined by the two methods on the same mRNA-seq
experiment (Figure 2). It is fine to integrate one of the methods
into your personal pipeline for mRNA-seq analysis.
Figure 2. Comparison of Htseq-count and my Perl script
The Perl script
for counting reads
#! /usr/bin/perl -w
use strict;
use warnings;
use List::Util;
use DB_File;
#read gtf or gff file, the default is 'gene_id'
sub read_gff{
my $gff_file=$_[0];
print "read genome annotations\n";
my %annot_info;
open my($IN), "<", $gff_file or die;
while(<$IN>){#2
chomp($_);
my @items=split("\t", $_);
my $RNA_chr=$items[0];
my $RNA_specie=$items[2];
my $RNA_start=$items[3];
my $start_len=int($RNA_start/1e4);
my $RNA_end=$items[4];
my $end_len=int($RNA_end/1e4);
my $annot=$items[8];
$annot=~s/;$//;
my @annot=split(/;\s/, $annot);
my %hash;
foreach my $ann(@annot){
my($type, $id)=split(/\s/, $ann);
$id=~s/"//g;
$hash{$type}=$id;
#print "#$hash{$type}#\n";
}
#
if($RNA_specie eq 'exon' ){#3
$annot_info{$RNA_chr}->{$start_len}->{$hash{exon_id}}->{start}=$RNA_start;
$annot_info{$RNA_chr}->{$start_len}->{$hash{exon_id}}->{end}=$RNA_end;
if(exists
$annot_info{$RNA_chr}->{$start_len}->{$hash{exon_id}}->{gene_id}){#4
my @a=split(',',
$annot_info{$RNA_chr}->{$start_len}->{$hash{exon_id}}->{gene_id});
unless(List::Util::first {$_ eq $hash{gene_id}} @a){
push(@a, $hash{gene_id});
$annot_info{$RNA_chr}->{$start_len}->{$hash{exon_id}}->{gene_id}=join(',',
@a);
print "#$hash{exon_id}#@a\n";
}
}#4
else{#4
$annot_info{$RNA_chr}->{$start_len}->{$hash{exon_id}}->{gene_id}=$hash{gene_id};
}#4
unless($start_len==$end_len){#4
$annot_info{$RNA_chr}->{$end_len}->{$hash{exon_id}}->{start}=$RNA_start;
$annot_info{$RNA_chr}->{$end_len}->{$hash{exon_id}}->{end}=$RNA_end;
if(exists
$annot_info{$RNA_chr}->{$end_len}->{$hash{exon_id}}->{gene_id}){#5
my @a=split(',',
$annot_info{$RNA_chr}->{$end_len}->{$hash{exon_id}}->{gene_id});
unless(List::Util::first {$_ eq $hash{gene_id}} @a){
push(@a, $hash{gene_id});
$annot_info{$RNA_chr}->{$end_len}->{$hash{exon_id}}->{gene_id}=join(',',
@a);
}
}#5
else{#5
$annot_info{$RNA_chr}->{$end_len}->{$hash{exon_id}}->{gene_id}=$hash{gene_id};
}#5
}#4
}#3
}#2
close($IN);
return(\%annot_info);
}
#subroutine
sub read_sam_record{
my ($sam_record)=@_;
chomp($sam_record);
my @items=split("\t", $sam_record);
#get alignment record
my %sam_info;
$sam_info{query}=$items[0];
$sam_info{flags}=$items[1];
$sam_info{chr}=$items[2];
$sam_info{offset}=$items[3];
$sam_info{offset_len}=int($sam_info{offset}/1e4);
$sam_info{mapQ}=$items[4];
$sam_info{CIGAR}=$items[5];
$sam_info{mate_chr}=$items[6];
$sam_info{mate_offset}=$items[7];
$sam_info{fragment_size}=$items[8];
$sam_info{offend}=$sam_info{offset}+$sam_info{fragment_size};
$sam_info{offend_len}=int($sam_info{offend}/1e4);
$sam_info{seq}=$items[9];
$sam_info{seqQ}=$items[10];
for (my $i=11; $i<@items; $i++){
my($head, $middle, $value)=split(':', $items[$i]);
$sam_info{$head}=$value;
}
return(\%sam_info);
}
#subroutine
sub count_reads{
my ($sam_file, $annot_info_pointer)=@_;
my %annot_info=%$annot_info_pointer;
my $n=1;
my (%RC, %pair);
print "read sam file\n";
open my($SAM), "<", $sam_file or die;
while(<$SAM>){#2
my $sam_info_pointer=read_sam_record($_);
my %sam_info=%$sam_info_pointer;
#renew number of raw data
if( $sam_info{NH}==1 and $sam_info{fragment_size}>0){#3
my $chr=$sam_info{chr};
for my $str_len($sam_info{offset_len}..$sam_info{offend_len}){#4
if(exists $annot_info{$chr}->{$str_len}){#5
my $pointer=$annot_info{$chr}->{$str_len};
my %annot=%$pointer;
my @target_ref;
foreach my $gene_id(keys %annot){#6
my $annot_start=$annot{$gene_id}->{start};
my $annot_end=$annot{$gene_id}->{end};
my $annot_id=$annot{$gene_id}->{gene_id};
my $start=$sam_info{offset};
my $end=$sam_info{offend};
if( $annot_start>=$start){
if($annot_end<=$end or ($annot_end>$end and
($annot_end+10)<=$end) ){
push(@target_ref, $annot_id) unless List::Util::first
{$annot_id eq $_} @target_ref;
}
}
else{
if ($annot_end>=$start+10 or $annot_end>=$end ){
push(@target_ref, $annot_id) unless List::Util::first
{$annot_id eq $_} @target_ref;
}
}
last if @target_ref>1;
}#6
if(@target_ref==1){
my $unique_ref=shift @target_ref;
$RC{$unique_ref}++;
}
}#5
}#4
$n++;
print "$n\n";
}#3
}#2
close($SAM);
print "export to the txt file\n";
my
$file='/home/yuan/mysql_pre/cookbooks/result_gene_read_counts.txt';
open my($OUT), ">", $file or die;
foreach my $ref(keys %RC){
my $read_counts=$RC{$ref};
print $OUT "$ref\t", "$read_counts\n";
}
close($OUT);
#
}
########################
#main programm
########################
#genome annotation
my $gff_file="/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38.gtf";
my $annot_info_pointer=read_gff($gff_file);
my %annot_info=%$annot_info_pointer;
#RNA satuation analysis
my $sam_file="/home/yuan/data_2/test_accepted_hits.sorted.sam";
count_reads($sam_file, \%annot_info);
print "OK\n";
Reference
S Anders, T P Pyl, W
Huber: HTSeq — A Python framework to work with high-throughput
sequencing data. Bioinformatics, 2014, in print; online at
doi:10.1093/bioinformatics/btu638.
Writing date:
2014.04.10, 2015.02.10
No comments:
Post a Comment