#!/usr/bin/perl -w # # copy-by-attr # # copy a list of files into a set of directories named according # to attributes of the files which are being copied. # # some use cases: # + copying photos from digital camera memory cards # into a directory hierarchy based on the dates the images # were taken, all in one go # # + creating a set of directories containing links to files # based on their size, owner and suffix # # # usage: # [options] sources... target-root # # where: # sources... a list of files to be copied (or linked to), # # target-root the top-level directory of the place where files # will be copied to # # and options can be any of the *attribute* options: # # --date -- use modification date to group files # -- format is yyyy-mm-dd # (this is the default if no attribute options are specified) # # --year -- use modification year to group files # -- format is yyyy # # --hour -- use modification hour to group files # -- format is hh # # --suffix -- use file suffix to group files # -- files with no suffix get grouped under 'NO-SUFFIX' # # --size -- use size to group files # -- directories are named according to # empty,1k,10k,100k,1M,10M,100M,1G,huge # and each file is stashed into the smallest group # that would accommodate it completely # # --user -- use user name to group files # # --group -- use group name to group files # # # and any of the *behaviour* options: # # --symlink -- only symlink to source items, rather than copying them # # --attrname -- include in the generated path names a component that # names the class of attribute; so, copying by date to # /foo would create subdirs such as /foo/by-date/yyyy-mm-dd/; # this can be useful where it is possible for attribute # values to be duplicated between sets (such as using # --hour and --suffix, and one of the suffixes happened # to be '10', for example) # # --tgtfirst -- provide the target directory *before* the source arguments # (handy for when you want the program to be invoked from # something like xargs or the --exec option of find) # # ################################################################ # # the target-root is treated as the root of a tree of sub-directories, # each named according to the rules for an attribute, and the # individual copied files are put into the corresponding subdirectory. # # so, if you copy a bunch of files from a camera, which are named # R1234567.JPG, (where 1234567 is some arbitrary number, different # for each image), and you specify that they be copied by date to # /foo, then each image will end up in a directory such as # /foo/2008-01-02/R1234567.JPG, where '2008-01-02' is the modification # date of file R1234567.JPG. # ################################################################ # # # The default is to look at the end of the arg list for the target # directory, but this can cause problems if you want to invoke # this from xargs, for example. So, you may pass the target-root # on the command line *before* the source files by using the --tgtfirst # option -- this is contrary to convention, but allows using xargs for # marshalling lots of files, perhaps as the result of a find. # # # # Note that if no *attribute* options are passed, then 'date' is presumed. # # If you do specify any attribute options, and also wish 'date' to be # included, you must specifiy it explicitly. # # # Note that, if you specify more than one *attribute* option, then # at most one copy of each file will be made; all other attribute options # will be satisfied by using symlinks. You can, of course, make multiple # copies by just invoking this command multiple times, once for each # attribute option. (And, no, at present you cannot specify which # attribute option is the copy, and which are the symlinks.) # # # Behaviour options are independent of the attribute options, and their # presence or absence has no impact on how the attribute options are handled. # ################################################################ # # Obvious enhancements: # # + make it process a list of files piped in, rather than explicitly handed # to it on the command line. This is only a minor enhancement, since I've # deliberately allowed for 'target-root' to be at the start of the arglist # to allow things like xargs to invoke this cleanly. # # + make the attribute(s) of each file that affect how they're copied/linked # based on an external process (or an expression, but that could be nightmarish # compared with having an external task with a simple calling convention); # I'm specifically not succumbing to the temptation to do this just now, because # there is no specific problem that I have that needs it, and I can see it being # the start of several complete re-writes, culminating in some dynamically- # generated, database-backed file system layer that wins smart-arsery awards # while solving no existing problems any better than this script does. # # ################################################################ # # Scratched into existence by Frank Wales (frank@limov.com) due to a # colossally itchy pile of memory cards whose contents needed filing by date. # ################################################################ # # What, you want a version number? Aw, for fuff's sake, it's # just a little script. # # All right then, this is version Pink-and-fluffy-Slash-Apple-Point-Five. # # Happy now? # ################################################################ # # Changes? I've had a few. But then again, too few to mention. # # (Except for that one where I put the target root back at the # end of the arglist, and added the --tgtfirst option. # Oh, and switched from gmtime to localtime to make the # date-based options more intuitive for pics taken near midnight.) # ################################################################ # # You might as well have some permission to copy and change it too, # I guess, so this program is: # # Copyright Limitless Innovations 2008 # # and is licensed under your choice of: # # + the GNU General Public License version 2 or later, a copy of which is not attached # + the deal we did in the off-licence that you signed with your own blood # + some other arrangement, for which you are welcome to contact # Frank Wales, being frank@limov.com # along with your proposals for exchange of consideration, # meeting of minds, et cetera, ad infinitum, dona nobile, allons-y. # # ################################################################ # # And now, our Feature Presentation # use strict; use warnings; use File::Basename; use Getopt::Long; use Cwd; my $do_symlink =''; my $do_attrname=''; my $do_tgtfirst=''; my $HERE=cwd; ################################################################ # # some path-naming utilities -- all expect to be passed in # the path name of the source, as passed in on the command line, # and the output of stat on that same file as a 13-element array # # (we could stat the thing, given its name, but since the # caller has to stat it anyway, why hit the file system # twice for every file?) # ################################################################ ################################################################ # # modification date -- currently uses hard-coded format; # suggestions welcome for ways to customize this that aren't # hideously complicated -- I guess the POSIXly-correct way # would involve $LC_/$LANG-type stuff, but I generally hate # those for doing almost the wrong thing in nearly the right way. # # I suppose I'm willing to be persuaded, though... # sub mdate_path_for($@) { my $filename=shift; my @ltm=localtime($_[9]); return sprintf('%04d-%02d-%02d',$ltm[5]+1900,$ltm[4]+1,$ltm[3]); } sub myear_path_for($@) { my $filename=shift; my @ltm=localtime($_[9]); return $ltm[5]+1900; # you don't really want a sprintf here, do you? } sub mhour_path_for($@) { my $filename=shift; my @ltm=localtime($_[9]); return sprintf("%02d",$ltm[2]); } # # I think that's enough time-based examples for you to be able # to cons up your own extra ones if you need them # ################################################################ ################################################################ # # this seems to me to be quite ugly -- suggestions welcome for # making grouping by size more useful; in particular, making # the step sizes more intuitive would be good. # sub size_path_for($@) { my $filename=shift; my $filesize=$_[7]; # yes, it could be a loop and a hash, but that's really # just putting a fancy hat on what's still a horse's butt if ($filesize==0 ) { return 'empty'; } elsif ($filesize<1024 ) { return '1k' ; } elsif ($filesize<10240 ) { return '10k' ; } elsif ($filesize<102400 ) { return '100k' ; } elsif ($filesize<1048576 ) { return '1M' ; } elsif ($filesize<10485760 ) { return '10M' ; } elsif ($filesize<104857600 ) { return '100M' ; } elsif ($filesize<1073741824) { return '1G' ; } else { return 'huge' ; } } ################################################################ # # falls back to BIG-FUGLY default that is highly unlikely # ever to be used as an actual suffix # my $no_suffix='NO-SUFFIX'; sub suffix_path_for($@) { my $filename=shift; # this is the only one that currently needs the filename if ($filename=~m(\.([^./]*)$)) { # arbitrarily specify suffix as all that # follows the last '.' as long as it doesn't include a '/' return $1; } else { return $no_suffix; } } ################################################################ # # uses trivial fall-back when UID is not known locally # sub user_path_for($@) { my $filename=shift; my $uid=$_[4]; return getpwuid($uid) || "uid=$uid"; } ################################################################ # # uses trivial fall-back when GID is not known locally # sub group_path_for($@) { my $filename=shift; my $gid=$_[5]; return getgrgid($gid) || "gid=$gid"; } # the keys for this hash define the names of the # attribute options that we can be invoked with my %path_for=( date => \&mdate_path_for, year => \&myear_path_for, hour => \&mhour_path_for, suffix => \&suffix_path_for, size => \&size_path_for, user => \&user_path_for, group => \&group_path_for ); my $default_attr='date'; my %invoked_with=(); # start with the behaviour options my %options=( symlink => \$do_symlink, attrname => \$do_attrname, tgtfirst => \$do_tgtfirst ); # then stuff in the attribute options, based on the # functions we've associated with %path_fors foreach my $optname (keys %path_for) { $invoked_with{$optname}=''; $options{$optname}=\$invoked_with{$optname}; } GetOptions(%options); # after that, only non-command-line options ought to remain in @ARGV if ($#ARGV<1) { print STDERR $0,": too few arguments\n usage: $0 [options] source[...] target-root\n or: $0 [options] --tgtfirst target-root source[...]\n where options can be any of:\n"; foreach my $opt (sort keys %options) { print STDERR " --$opt\n"; } exit 2; } unless (grep /1/, @invoked_with{keys %path_for}) { $invoked_with{$default_attr}=1; } my $target; if ($do_tgtfirst) { # take the *first* available non-option argument, which is contrary to # standard copying conventions, but enables xargs-type shenanigans $target=shift(@ARGV); } else { # take the last available non-option argument $target=pop(@ARGV); } # if $target ends in a / (which might have been put there by the # shell if markdirs is turned on), then strip it, if only to # keep tidy any symlinks that we generate chop $target if substr($target,-1) eq '/'; my @sources=@ARGV; # now, prep the target area unless (-d $target && -w $target) { print STDERR $0,": cannot write to target directory '",$target,"'\n"; exit 1; } my %attr_paths=(); foreach my $attr (keys %path_for) { $attr_paths{$attr}=$do_attrname?"by-$attr/":""; } # we allow multiple attribute copies, why not? # # however, in this case, if we've been asked to copy, # we only copy to the first attribute, and symlink the # rest to it # # if we've been explicitly asked to symlink, we symlink # everything and copy nothing # # note that, at present, we don't have a way of specifying # which attribute should be the copied one, to which all # the other will symlink # # (also, the symlinking is dumb, in that it uses the # full path name, rather than the shortest relative pathname, # which would make the target tree re-parentable -- feel free # to fix this egregious oversight if it matters to you) # SOURCE: foreach my $src (@sources) { my @srcstat=stat $src; my $src_basename=basename($src); unless ($#srcstat==12) { print STDERR $0,": cannot stat '$src' -- $!\n"; next SOURCE; } my @targets=(); for my $optname (keys %path_for) { if ($invoked_with{$optname}) { push (@targets,$attr_paths{$optname}.$path_for{$optname}($src,@srcstat)); mkdir "$target/$attr_paths{$optname}",0775 unless -d "$target/$attr_paths{$optname}"; } } my $copy=''; $copy=shift(@targets) unless $do_symlink; my $copypath=''; my $copytgt=''; # do the copy if ($copy) { $copypath="$target/$copy"; mkdir $copypath,0775 unless -d $copypath; unless (-d $copypath) { print STDERR $0,": cannot create directory '$copypath' -- $!\n"; next SOURCE; } # we don't attempt to be recursive -- we depend on the invoker # to do that for us, perhaps with find or something, instead # # we are lenient about copying from symlinks, though unless (-f $src || -l $src) { print STDERR $0,": skipping non-copyable item '$src'\n"; next SOURCE; } unless (open(IN,"<",$src)) { print STDERR $0,": cannot open '$src' for reading -- $!\n"; next SOURCE; } $copytgt="$copypath/$src_basename"; unless (open(OUT,">",$copytgt)) { print STDERR $0,": cannot open '$copytgt' for writing -- $!\n"; close IN; next SOURCE; } # maybe we should check for write failures, eh? while() { unless (print OUT) { print STDERR $0,": error writing to '$copytgt' -- $!\n"; close OUT; close IN; exit 1; # write failure is serious enough to scrub the mission } } close OUT; close IN; # haven't decided if it's worth whining about failures of these, # since many reasonable copies will actually fail for some or all of # these, because of system policies (can't chown), or file system # limitations (VFAT, for example) chmod $srcstat[2], $copytgt; utime $srcstat[8],$srcstat[9],$copytgt; chown $srcstat[4],$srcstat[5],$copytgt; } foreach my $link (@targets) { my $linkpath="$target/$link"; mkdir $linkpath,0775 unless -d $linkpath; unless (-d $linkpath) { print STDERR $0,": cannot create directory '$linkpath' -- $!\n"; next SOURCE; } if ($copytgt) { symlink $copytgt,"$linkpath/$src"; } else { # need the full path to $src, really if ($src=~/^\//) { symlink $src,"$linkpath/$src"; } else { symlink "$HERE/$src","$linkpath/$src"; } } } } exit 0;