From bb9e38a67a2c325dd757cb007706d74daa0aa462 Mon Sep 17 00:00:00 2001 From: nylander Date: Mon, 30 Sep 2024 16:17:15 +0200 Subject: [PATCH] Add -b --- LICENSE | 2 +- README.md | 53 +++++++++++++++++++++++++++++------------------ catfasta2phyml.pl | 40 +++++++++++++++++++++++++++++------ 3 files changed, 68 insertions(+), 27 deletions(-) diff --git a/LICENSE b/LICENSE index 56d84cc..6ede8b5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2010-2020 Johan Nylander +Copyright (c) 2010-2024 Johan Nylander Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 370d9fb..9259c01 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ ### NAME -`catfasta2phyml.pl` -- Concatenate FASTA alignments to PHYML, PHYLIP, or FASTA format +`catfasta2phyml.pl` -- Concatenate FASTA alignments to PHYML, PHYLIP, or FASTA +format ### SYNOPSIS @@ -12,49 +13,57 @@ - **-h, -?, --help** - Print a brief help message and exits. +Print a brief help message and exits. - **-m, --man** - Prints the manual page and exits. +Prints the manual page and exits. - **-c, --concatenate** - Concatenate files even when number of taxa differ among alignments. - Missing data will be filled with all gap (-) sequences. +Concatenate files even when number of taxa differ among alignments. Missing +data will be filled with all gap (-) sequences. - **-i, --intersect** - Concatenate sequences for sequence labels occuring in all input files - (intersection). +Concatenate sequences for sequence labels occurring in all input files +(intersection). - **-f, --fasta** - Print output in FASTA format (default is PHYML format). +Print output in FASTA format (default is PHYML format). - **-p, --phylip** - Print output in a strict PHYLIP format. 
- See [http://evolution.genetics.washington.edu/phylip/doc/sequence.html](http://evolution.genetics.washington.edu/phylip/doc/sequence.html). +Print output in a strict PHYLIP format. See section "Data file format" on +[https://phylipweb.github.io/phylip/doc/main.html#inputfiles](https://phylipweb.github.io/phylip/doc/main.html#inputfiles) - **Note:** The current output is not entirely strict for the - interleaved format. Left to do is to efficiently print sequences - in blocks of 10 characters. The sequential PHYLIP format works, - on the other hand (use **-s** in combination with **-p**). +**Note:** The current output is not entirely strict for the interleaved format. +Left to do is to efficiently print sequences in blocks of 10 characters. The +sequential PHYLIP format works, on the other hand (use **-s** in combination +with **-p**). - **-s, --sequential** - Print output in sequential format (default is interleaved). +Print output in sequential format (default is interleaved). + +- **-b, --basename=suffix** + +Ensure the basename is used as partition definition. If the provided **suffix** +(required) matches the file suffix, it will be removed from the output string. + +**Note:** If the suffix is to be kept, one may use this format: **--basename=' +'** (basically providing a string that will not match the file suffix). - **-v, --verbose** - Be verbose by showing some useful output. See the combination with **-n**. +Be verbose by showing some useful output. See the combination with **-n**. - **-n, --noprint** - Do not print the concatenation, just check if all files have the same - sequence lables and lengths. Program returns 1 on exit. - See also the combination with **-v**. +Do not print the concatenation, just check if all files have the same sequence +labels and lengths. Program returns 1 on exit. See also the combination with +**-v**. 
- **-V, --version** @@ -112,6 +121,10 @@ To concatenate sequences for sequence labels occuring in all files: $ catfasta2phyml.pl --intersect *.fas +To ensure basename as name and suffix removal in partition definition: + + $ catfasta2phyml.pl -b.fas dat/file1.fas dat/file2.fas > out.phy + ### TIPS **1. "Argument list too long" error?** @@ -188,7 +201,7 @@ Uses Perl modules Getopt::Long and Pod::Usage ### LICENSE AND COPYRIGHT -Copyright (c) 2010-2022 Johan Nylander +Copyright (c) 2010-2024 Johan Nylander Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/catfasta2phyml.pl b/catfasta2phyml.pl index ae9ac96..f498d12 100755 --- a/catfasta2phyml.pl +++ b/catfasta2phyml.pl @@ -11,13 +11,14 @@ use warnings; use Pod::Usage; use Getopt::Long; +use File::Basename; Getopt::Long::Configure("bundling_override", "no_ignore_case"); #--------------------------------------------------------------------------- # Global variables #--------------------------------------------------------------------------- -my $VERSION = '1.2.0'; +my $VERSION = '1.2.1'; my %HoH = (); my %seqids = (); # my %nseq_hash = (); # key:infile, val:nseq @@ -44,6 +45,7 @@ my $lwidth = 60; # Default line width for fasta my $nt_counter = 1; # Counter for partitions my $end_count = 0; # Counter for partitions +my $basename = 0; # Basename #--------------------------------------------------------------------------- @@ -56,6 +58,7 @@ GetOptions( 'h|help|?' 
=> sub { pod2usage(1) }, 'm|man' => sub { pod2usage(-exitstatus => 0, -verbose => 2) }, + 'b|basename=s' => \$basename, 'c|concatenate' => \$concatenate, 'f|fasta' => \$fasta, 'i|intersect' => \$intersect, @@ -216,7 +219,13 @@ foreach my $file (@infiles) { die "\n\nError: $file not in HoH\n" unless exists(${HoH}{$file}); $end_count = $nt_counter + ${HoH}{$file}{'nchars'} - 1; - print STDERR "$file = $nt_counter-$end_count\n"; + if ($basename) { + my $f = basename($file, $basename); + print STDERR "$f = $nt_counter-$end_count\n"; + } + else { + print STDERR "$file = $nt_counter-$end_count\n"; + } $nt_counter = $nt_counter + ${HoH}{$file}{'nchars'}; my @seq_ids = (); if ($intersect) { @@ -253,7 +262,13 @@ foreach my $file (@infiles) { die "\n\nError: $file not in HoH\n" unless exists(${HoH}{$file}); $end_count = $nt_counter + ${HoH}{$file}{'nchars'} - 1; - print STDERR "$file = $nt_counter-$end_count\n"; + if ($basename) { + my $f = basename($file, $basename); + print STDERR "$f = $nt_counter-$end_count\n"; + } + else { + print STDERR "$file = $nt_counter-$end_count\n"; + } $nt_counter = $nt_counter + ${HoH}{$file}{'nchars'}; my @seq_ids = (); if ($intersect) { @@ -441,7 +456,7 @@ sub phylip_blocks { #=== POD DOCUMENTATION ======================================================= -# VERSION: Mon 21 nov 2022 12:56:41 +# VERSION: Mon 30 Sep 2024 13:38:57 # DESCRIPTION: Documentation # TODO: ? #=============================================================================== @@ -504,6 +519,15 @@ =head1 OPTIONS Print output in sequential format (default is interleaved). +=item B<-b, --basename=suffix> + +Ensure the basename is used as partition definition. If the provided C<suffix> +(required) matches the file suffix, it will be removed from the output string. + +B<Note:> If the suffix is to be kept, one may use this format: C<--basename=' '> +(basically providing a string that will not match the file suffix). + + =item B<-v, --verbose> Be verbose by showing some useful output. 
See the combination with B<-n>. @@ -571,6 +595,10 @@ =head1 USAGE catfasta2phyml.pl --intersect *.fas +To ensure basename as name and suffix removal in partition definition: + + catfasta2phyml.pl -b.fas dat/file1.fas dat/file2.fas > out.phy + =head1 AUTHOR @@ -579,12 +607,12 @@ =head1 AUTHOR =head1 DEPENDENCIES -Uses Perl modules Getopt::Long and Pod::Usage +Uses Perl modules Getopt::Long, Pod::Usage, and File::Basename =head1 LICENSE AND COPYRIGHT -Copyright (c) 2010-2022 Johan Nylander +Copyright (c) 2010-2024 Johan Nylander Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal