#!/usr/bin/perl -w
#
# Build a k-mer index of a FASTA file.
#
# usage: <script> fasta out length
#
#   fasta   path of the FASTA file to read
#   out     path of the report file to write (truncated if it exists)
#   length  k-mer length, a positive integer
#
# Every window of <length> characters in every sequence is recorded together
# with the 1-based ordinal of the FASTA record it occurred in (residue lines
# appearing before the first '>' header are filed under ordinal 0).
#
# The report has one tab-separated line per distinct k-mer:
#
#   kmer <TAB> number-of-records <TAB> ordinal <TAB> ordinal ...
#
# Lines are ordered by ascending number of records; ties are broken by the
# k-mer string so that the output is reproducible between runs.
# Two summary lines (sequence count / total length, distinct k-mer count) are
# written to STDERR.

use strict;
use warnings;

# Script entry point.
#   @args - the command-line arguments (fasta, out, length).
# Prints the usage line and exits when the argument count is not three; dies
# when the length is not a positive integer or a file cannot be opened,
# written or closed.
sub main {
    my @args = @_;

    if (@args != 3) {
        print "usage: $0 fasta out length\n";
        exit;
    }
    my ($fasta_path, $out_path, $k) = @args;

    # A zero, negative or non-numeric length would make substr() misbehave.
    die "length must be a positive integer, got '$k'\n"
        unless $k =~ /\A[1-9][0-9]*\z/;

    # Three-argument open: the file names are never interpreted as modes.
    open(my $in,  '<', $fasta_path) or die "cannot open $fasta_path: $!\n";
    open(my $out, '>', $out_path)   or die "cannot open $out_path: $!\n";

    my ($records_for, $names, $total_len) = read_kmers($in, $k);
    close($in);

    warn scalar(@{$names}), " sequences of $total_len bp total read\n";
    warn scalar(keys %{$records_for}), " kmers identified\n";

    write_report($out, $records_for);

    # Buffered write errors (e.g. disk full) only surface at close time.
    close($out) or die "cannot close $out_path: $!\n";
    return;
}

# Read FASTA records from a filehandle and index their k-mers.
#   $fh - readable filehandle positioned at the start of the FASTA data
#   $k  - k-mer length
# Returns a three-element list:
#   - hashref  kmer => { record ordinal => occurrence count }
#   - arrayref of header names (text after '>'), in file order
#   - total number of sequence characters read
sub read_kmers {
    my ($fh, $k) = @_;

    my %records_for;
    my @names;
    my $seq       = '';
    my $total_len = 0;

    while (my $line = <$fh>) {
        chomp $line;
        if ($line =~ /^>(.+)/) {
            my $name = $1;

            # A new header closes the previous record; scalar(@names) is
            # still that record's ordinal because the new name has not been
            # pushed yet.
            $total_len += add_kmers(\%records_for, $seq, $k, scalar @names);
            $seq = '';
            push @names, $name;
        }
        else {
            $seq .= $line;
        }
    }

    # The last record has no following header to trigger its flush.
    $total_len += add_kmers(\%records_for, $seq, $k, scalar @names);

    return (\%records_for, \@names, $total_len);
}

# Record every k-mer of one sequence under the given record ordinal.
#   $records_for - hashref to update: kmer => { ordinal => count }
#   $seq         - the sequence string (may be empty)
#   $k           - k-mer length
#   $record      - ordinal of the record the sequence belongs to
# Returns the length of $seq.
sub add_kmers {
    my ($records_for, $seq, $k, $record) = @_;

    # Windows start at 0 .. length-k inclusive (length-k+1 windows in total).
    # When the sequence is shorter than $k the range is empty.
    my $last_start = length($seq) - $k;
    for my $start (0 .. $last_start) {
        $records_for->{ substr($seq, $start, $k) }{$record}++;
    }

    return length $seq;
}

# Write the k-mer report to a filehandle.
#   $out         - writable filehandle
#   $records_for - hashref  kmer => { ordinal => count }
# Dies if a line cannot be written.
sub write_report {
    my ($out, $records_for) = @_;

    # Fewest records first; the string comparison makes tie order stable.
    my @kmers = sort {
        scalar(keys %{ $records_for->{$a} })
            <=> scalar(keys %{ $records_for->{$b} })
            or $a cmp $b
    } keys %{$records_for};

    for my $kmer (@kmers) {
        my @records = sort { $a <=> $b } keys %{ $records_for->{$kmer} };
        print {$out} join("\t", $kmer, scalar(@records), @records), "\n"
            or die "cannot write report: $!\n";
    }
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;

1;