#!/usr/bin/perl
##========================================================================
## Author: Lizhi.Xu
## Email: xulzh2010@mail.com
## File Name: fatools.pl
## Description:
##
## Edit History:
##       2012-04-18      File created.
##========================================================================

use strict;
use warnings;

use FindBin;
use Getopt::Long;

&usage() if @ARGV < 1;
my $cmd = shift;
my %func = (format => \&format, sort => \&sort, stats => \&stats, comp => \&comp, subseq => \&subseq, search => \&search, split => \&split);

&usage("Unknown commamd \'$cmd\'") unless (exists $func{$cmd});
&{$func{$cmd}}();
exit;

sub format {
	my %opts = (length => 0);
	GetOptions (\%opts, 'help', 'output=s', 'length=i', 'rename=s', 'desc=s', 'revcom') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));
	open STDOUT, ">$opts{output}" or die $! if exists $opts{output};
	my ($tot_num, $tot_len, $get_num, $get_len);
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
        s/\r//g;
		my ($head, $seq) = split (/\n/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		$seq =~ s/[\n\s\r]//g;
		my $len = length $seq;
		$tot_num ++;
		$tot_len += $len;
		next if $len < $opts{length};
		$get_num ++;
		$get_len += $len;
		$seq = uc $seq;
		if (exists $opts{revcom}) {
			$seq =~ tr/ATGCN/TACGN/;
			$seq = reverse $seq;
		}
		$seq =~ s/(\w{60})/$1\n/g;
		$seq =~ s/\n+$//;
		$desc = $opts{desc} if exists $opts{desc};
		$id = sprintf ("$opts{rename}_%05d", $get_num) if exists $opts{rename};
		if (defined $desc) {
			printf (">$id $desc\n$seq\n");
		} else {
			printf (">$id\n$seq\n");
		}
	}
	$/ = "\n";
	warn "# Total sequence number $tot_num, length $tot_len bp\n";
	warn "# Write sequence number $get_num, length $get_len bp\n";
}

sub sort {
	my %opts = (length => 0);
	GetOptions (\%opts, 'help', 'output=s', 'type=s', 'length=i', 'rename=s', 'desc=s', 'revcom') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));
	open STDOUT, ">$opts{output}" or die $! if exists $opts{output};
	my ($tot_num, $tot_len, $get_num, $get_len);
	my %hash;
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
		my ($head, $seq) = split (/\n/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		$seq =~ s/[\n\s\r]//g;
		my $len = length $seq;
		$tot_num ++;
		$tot_len += $len;
		next if $len < $opts{length};
		$seq = uc $seq;
		if (exists $opts{revcom}) {
			$seq =~ tr/ATGCN/TACGN/;
			$seq = reverse $seq;
		}
		$hash{$id}{seq} = $seq;
		$hash{$id}{desc} = $desc if defined $desc;
		$hash{$id}{len} = $len;
	}
	$/ = "\n";
	if ($opts{type} eq 'length') {
		foreach my $id (sort {$hash{$b}{len} <=> $hash{$a}{len}} keys %hash) {
			$get_num ++;
			$get_len += $hash{$id}{len};
			my $seq = $hash{$id}{seq};
			$seq =~ s/(\w{60})/$1\n/g;
			$seq =~ s/\n+$//;
			my $name;
			if (exists $opts{rename}) {
				$name = sprintf ("$opts{rename}_%05d", $get_num);
			} else {
				$name = $id; 
			}
			if (exists $hash{$id}{desc}) {
				printf (">$name %s\n$seq\n", $hash{$id}{desc});
			} else {
				printf (">$name\n$seq\n");
			}
		}
	} elsif ($opts{type} eq 'id') {
		foreach my $id (sort keys %hash) {
			$get_num ++;
			$get_len += $hash{$id}{len};
			my $seq = $hash{$id}{seq};
			$seq =~ s/(\w{60})/$1\n/g;
			$seq =~ s/\n+$//;
			my $name;
			if (exists $opts{rename}) {
				$name = sprintf ("$opts{rename}_%05d", $get_num);
			} else {
				$name = $id; 
			}
			if (exists $hash{$id}{desc}) {
				printf (">$name %s\n$seq\n", $hash{$id}{desc});
			} else {
				printf (">$name\n$seq\n");
			}
		}
	}
	warn "# Total sequence number $tot_num, length $tot_len bp\n";
	warn "# Write sequence number $get_num, length $get_len bp\n";
}



sub comp {
	my %opts = (prefix => 'contig', gaptype => 'scaffold', linkage => 'yes', evidence => 'paired-end');
	GetOptions (\%opts, 'help', 'output=s', 'agpfile=s', 'prefix=s', 'gaptype=s', 'linkage=s', 'evidence=s') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));
	open STDOUT, ">$opts{output}" or die $! if exists $opts{output};
	if (exists $opts{agpfile}) {
		open AGP, ">$opts{agpfile}" or die $!;
		printf AGP ("##agp-version  2.0\n");
	}
	my (%hash, %stats);
	my $ctg_num = 0;
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
		my ($head, $seq) = split (/\n/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		$seq =~ s/[\n\s\r]//g;
		$seq = uc $seq;
		my $num = 0;
		my ($start, $end, $len) = (1, 0, 0);
		while (length $seq > 0) {
			if ($seq =~ s/^([ATGC]+)//) {
				my $str = $1;
				$ctg_num ++;
				$num ++;
				$len = length $str;
				$end += $len;
				$str =~ s/(\w{60})/$1\n/g;
				$str =~ s/\n+$//;
				my $ctg_id = sprintf ("$opts{prefix}_%05d", $ctg_num);
				printf (">$ctg_id $id:$start-$end:$len\n$str\n");
				printf AGP ("$id\t$start\t$end\t%d\tW\t$ctg_id\t1\t$len\t+\n", $num) if exists $opts{agpfile};
				$start += $len;
			} elsif ($seq =~ s/^(N+)//) {
				$num ++;
				$len = length $1;
				$end += $len;
				printf AGP ("$id\t$start\t$end\t%d\tN\t$len\tscaffold\tyes\tpaired-ends\n", $num) if exists $opts{agpfile};
				$start += $len;
			} else {
				$seq =~ s/(\w)//;
				die "Found unknown char \'$1\'\n";
			}
		}
	}
	$/ = "\n";
}						

sub subseq {
	my %opts = ();
	GetOptions (\%opts, 'help', 'output=s', 'prefix=s', 'list=s', 'file=s') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));

	# get sub seq info
	my ($tot_num, $get_num, %hash);
	if (exists $opts{list}) {
		foreach my $info (split (/[,;]/, $opts{list})) {
			$tot_num ++;
			my ($id, @list) = split (/:/, $info);
			push (@{$hash{$id}}, [@list]);
		}
	}
	if (exists $opts{file}) {
		open FH, $opts{file} or die $!;
		while (<FH>) {
			chomp;
			next if (/^\s*$/ or /^\s*#/);
			$tot_num ++;
			my ($id, @list) = split (/\s+/, $_);
			push (@{$hash{$id}}, [@list]);
		}
		close FH;
	}
	
	# check input sub seq info
	if (defined $tot_num) {
		warn "# Total input sub sequence information $tot_num\n";
	} else {
		&usage('Not Found input sub sequence information');
	}
	
	open STDOUT, ">$opts{output}" or die $! if exists $opts{output};
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
		my ($head, $seq) = split (/\n/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		$seq =~ s/[\n\s\r]//g;
		my $len = length $seq;
		$seq = uc $seq;
		if (exists $hash{$id}) {
			foreach my $info (@{$hash{$id}}) {
				$get_num ++;
				my ($start, $end, $strand) = @$info;
				$strand = '+' unless defined $strand;
				my $subseq = substr ($seq, $start - 1, $end - $start + 1);
				if ($strand eq '-' or $strand eq 'R') {
					$subseq = reverse $subseq;
					$subseq =~ tr/ATGCN/TACGN/;
				}
				$subseq =~ s/(\w{60})/$1\n/g;
				$subseq =~ s/\n+$//;
				if (exists $opts{prefix}) {
					printf (">$opts{prefix}_%05d $id:$start-$end:$strand\n$subseq\n", $get_num);
				} else {
					printf (">$id:$start-$end:$strand\n$subseq\n");
				}
			}
		}
	}
	$/ = "\n";
	warn "# Total write sub sequence $get_num/$tot_num\n";
}



sub search {
	my %opts = ();
	GetOptions (\%opts, 'help', 'output=s', 'list=s', 'file=s') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));

	# get sub seq info
	my ($tot_num, $get_num, %hash) = (0, 0);
	if (exists $opts{list}) {
		foreach my $id (split (/[,;]/, $opts{list})) {
			$tot_num ++;
			$hash{$id} ++;
		}
	}
	if (exists $opts{file}) {
		open FH, $opts{file} or die $!;
		while (<FH>) {
			chomp;
			next if (/^\s*$/ or /^\s*#/);
			my ($id) = split (/\s+/, $_);
			$tot_num ++;
			$hash{$id} ++;
		}
		close FH;
	}
	
	# check input sub seq info
	my $uniq_num = keys %hash;
	if (defined $tot_num) {
		warn "# Total input search ID information $uniq_num/$tot_num\n";
	} else {
		&usage('Not Found input search ID information');
	}
	
	open STDOUT, ">$opts{output}" or die $! if exists $opts{output};
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
		my ($head, $seq) = split (/\n/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		if (exists $hash{$id}) {
			$get_num ++;
			$seq =~ s/[\n\s\r]//g;
			$seq = uc $seq;
			$seq =~ s/(\w{60})/$1\n/g;
			$seq =~ s/\n+$//;
			if (defined $desc) {
				printf (">$id $desc\n$seq\n");
			} else {
				printf (">$id\n$seq\n");
			}
			delete $hash{$id};
		}
	}
	$/ = "\n";
	if ($get_num == $uniq_num) {
		warn "# Search all list ID sequence $get_num/$uniq_num/$tot_num\n";
	} else {
		warn "# Total search $get_num/$uniq_num, and not search ID list: " . join (', ', keys %hash) . "\n";
	}
}

sub split {
	my %opts = (number => 4, model => 'file', output => 'part');
	GetOptions (\%opts, 'help', 'output=s', 'model=s', 'number=i') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));
	my ($tot_num, $tot_len) = (0, 0);
	my @list;
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
		my ($head, $seq) = split (/\n+/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		$seq =~ s/[\n\s\r]//g;
		$seq = uc $seq;
		$tot_num ++;
		$tot_len += length $seq;
		push(@list, [$id, $seq]);
	}
	
	if ($opts{model} eq 'seq' or $opts{model} eq 's') {
		my $file_num = 0;
		for (my $i = 0; $i < $tot_num; $i ++) {
			if ($i % $opts{number} == 0) {
				$file_num ++;
				open STDOUT, ">$opts{output}\_$file_num.fa" or die $!;
			}
			my $seq = $list[$i]->[1];
			$seq =~ s/(\w{60})/$1\n/g;
			$seq =~ s/\n+$//;
			printf (">%s\n$seq\n", $list[$i]->[0]);
		}
	} elsif ($opts{model} eq 'file' or $opts{model} eq 'f') {
		for (my $i = 1; $i <= $opts{number}; $i ++) {
			open STDOUT, ">$opts{output}_$i.fa" or die $!;
			for (my $j = $i - 1; $j < $tot_num; $j += $opts{number}) {
				my $seq = $list[$j]->[1];
				$seq =~ s/(\w{60})/$1\n/g;
				$seq =~ s/\n+$//;
				printf (">$list[$j]->[0]\n$seq\n");
			}
			close STDOUT;
		}
	} else {
		&usage("Unknown model: $opts{model}\n");
	}
	$/ = "\n";
}

sub stats {
	my %opts = (length => 0, format => 'table');
	GetOptions (\%opts, 'help', 'output=s', 'length=i', 'format=s', 'nohead') or &usage();
	&usage("No input and show $cmd help info:\n") if (exists $opts{help} or @ARGV < 1 and (-t STDIN));
	open STDOUT, ">$opts{output}" or die $! if exists $opts{output};
	my (%stats, @scf, @ctg, @gap);
	my ($scf_num, $scf_len, $ctg_num, $ctg_len, $gap_num, $gap_len, $gc) = (0, 0, 0, 0, 0, 0);
	$/ = "\n>";
	while (<>) {
		chomp;
		s/^>//g;
		next if /^\s*$/;
		my ($head, $seq) = split (/\n/, $_, 2);
		my ($id, $desc) = split (/\s+/, $head, 2);
		$seq =~ s/[\n\s\r]//g;
		$seq = uc $seq;
		my $slen = length $seq;
		next if ($slen < $opts{length});
		$scf_num ++;
		$scf_len += $slen;
		# $ctg_len = ($seq =~ s/([ATGC])/$1/g);
		$gc += ($seq =~ s/([GC])/$1/g);
		push (@scf, $slen);
		while (length $seq > 0) {
			if ($seq =~ s/^([ATGC]+)//) {
				$ctg_num ++;
				my $clen = length $1;
				$ctg_len += $clen;
				push (@ctg, $clen);
			} elsif ($seq =~ s/^(N+)//) {
				$gap_num ++;
				my $glen = length $1;
				$gap_len += $glen;
				push (@gap, $glen);
			} else {
				$seq =~ s/(\w)//;
				warn "Found unknown char \'$1\'\n";
			}
		}
	}
	$/ = "\n";
	
	$stats{scf} = &_stats ($scf_len, @scf);
	$stats{ctg} = &_stats ($ctg_len, @ctg);
	$stats{gap} = &_stats ($gap_len, @gap);
	if ($opts{format} eq 'list') {
		printf ("#ScfNum\tScfLen\tScfMin\tScfMax\tScfAvg\tScfN50\tScfN90\tCtgNum\tCtgLen\tCtgMin\tCtgMax\tCtgAvg\tCtgN50\tCtgN90\tGapNum\tGapLen\tGC(%%)\n") unless exists $opts{nohead};
		printf ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",&_num($stats{scf}->{num}),&_num($stats{scf}->{len}),&_num($stats{scf}->{min}),&_num($stats{scf}->{max}),&_num($stats{scf}->{avg}),&_num($stats{scf}->{n50}),&_num($stats{scf}->{n90}),&_num($stats{ctg}->{num}),&_num($stats{ctg}->{len}),&_num($stats{ctg}->{min}),&_num($stats{ctg}->{max}),&_num($stats{ctg}->{avg}),&_num($stats{ctg}->{n50}),&_num($stats{ctg}->{n90}),&_num($stats{gap}->{num}),&_num($stats{gap}->{len}),$gc/$ctg_len*100);
	} elsif ($opts{format} eq 'table') {
		printf ("Total GC content percentage: %2.2f%%\n",$gc/$ctg_len*100);
		printf ("------------------------------------------------------------\n");
		printf ("%-12s%15s%15s%15s\n",'','SCAFFOLD','CONTIG','GAP');
		printf ("------------------------------------------------------------\n");
		my %hash_tag = (num => 'Tot_Num', len => 'Tot_Len(bp)', min => 'Min_Len(bp)', max => 'Max_Len(bp)', avg => 'Ave_Len(bp)', n50 => 'N50(bp)', n60 => 'N60(bp)', n70 => 'N70(bp)', n80 => 'N80(bp)', n90 => 'N90(bp)');
		foreach my $tag (qw/num len min max avg n50 n60 n70 n80 n90/) {
			printf ("%-12s%15s%15s%15s\n",$hash_tag{$tag},&_num($stats{scf}->{$tag}),&_num($stats{ctg}->{$tag}),&_num($stats{gap}->{$tag}));
		}
		printf ("------------------------------------------------------------\n");
	}
}


sub usage {
	if (!defined $cmd) {
		print qq(@_
	Usage:
		perl $0 <cmd> [opts] <input> [...]
	Command:
		<format>    format fasta sequence file
		<sort>      sorted sequence by length or id
		<comp>      get sequence component
		<stats>     statistics sequences information
		<split>     split sequence file
		<subseq>    cut sequence by sub information
		<search>    search sequence by id
	\n);
	} elsif ($cmd eq 'format') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output <FILE>  file to write output [STDOUT]
		-length <INT>   filter sequence by length [0]
		-rename <STR>   rename sequence ID [none]
		-desc   <STR>   description of sequence [none]
		-revcom         reverse complement sequence
	\n);
	} elsif ($cmd eq 'sort') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output <FILE>  file to write output [STDOUT]
		-length <INT>   filter sequence by length [0]
		-rename <STR>   rename sequence ID [none]
		-type   <STR>   sort sequence by length/id [length]
		-desc   <STR>   description of sequence [none]
	\n);
	} elsif ($cmd eq 'stats') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output <FILE>  file to write output [STDOUT]
		-format <STR>   output format table/list [table]
		-length <INT>   minimum sequence length [0]
		-nohead         don't output header line
	\n);
	} elsif ($cmd eq 'comp') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output  <FILE>  component file to write output [STDOUT]
		-agpfile <FILE>  agp file to write output [none]
		-prefix   <STR>  prefix of component sequence [contig]
		-gaptype  <STR>  type of gap [scaffold]
		-linkage  <STR>  evidence of linkage [yes]
		-evidence <STR>  type of evidence used to linkage [paired-ends]
	\n);
	} elsif ($cmd eq 'split') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output   <STR>  split files name prefix to write output [part]
		-number   <STR>  number of split for the model [4]
		-model    <STR>  split model select file/seq [file]
	\n);
	} elsif ($cmd eq 'subseq') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output  <FILE>  component file to write output [STDOUT]
		-prefix  <STR>   prefix of output sequence ID [none]
		-list    <STR>   list of sub sequence information [ID:START:END:STRAND,...]
		-file    <STR>   file or sub sequence information
	\n);
	} elsif ($cmd eq 'search') {
		print qq(@_
	Usage:
		perl $0 $cmd <seq.fa> [...]
	Options:
		-output  <FILE>  component file to write output [STDOUT]
		-list    <STR>   list of sub sequence information [ID,...]
		-file    <STR>   file or sub sequence information
	\n);
	} else {
		print qq(Unknown command \'$cmd\', check it run again.);
		$cmd = undef;
		&usage();
	}
	exit;
}

sub _num {
	my $num = shift;
	while ($num =~ s/(\d)(\d{3})((,\d{3})*)$/$1,$2$3/) {}
	return $num;
}

sub _stats {
	my (%stat,@args);
	($stat{len}, @args) = @_;
    $stat{num} = @args;
    if ($stat{num} == 0) {
    	$stat{avg} = $stat{max} = $stat{min} = $stat{n50} = $stat{n60} = $stat{n70} = $stat{n80} = $stat{n90} = 0;
    } else {
    	$stat{avg} = sprintf("%d",$stat{len}/$stat{num});
    	my $sum = 0;
    	foreach my $len (sort {$b <=> $a} @args) {
    		$stat{max} = $len if !exists $stat{min};
    		$stat{min} = $len;
    		$sum += $len;
    		$stat{n50} = $len if ($sum/$stat{len} >= 0.5 and !exists $stat{n50});
    		$stat{n60} = $len if ($sum/$stat{len} >= 0.6 and !exists $stat{n60});
    		$stat{n70} = $len if ($sum/$stat{len} >= 0.7 and !exists $stat{n70});
    		$stat{n80} = $len if ($sum/$stat{len} >= 0.8 and !exists $stat{n80});
    		$stat{n90} = $len if ($sum/$stat{len} >= 0.9 and !exists $stat{n90});
    	}
    }
    return \%stat;
}



