 |
#!/usr/bin/perl
#
# mount.restfs: mounts a "web filesystem" using fuse
#
# REQUIRED non-standard modules: Fuse, LWP::UserAgent, URI
# OPTIONAL modules: XML::XPath, HTML::TreeBuilder, JSON, CSS
#
# assuming mount on /mnt, paths look like:
#
# /mnt/<encoded url>/<info>
#
# where <encoded url> is a url with , replaced with %2C then / replaced with ,
# so, e.g. /mnt/en.wikipedia.org,w,api.php?action=parse&page=Kittens
# (shell escaping is left as an exercise to the reader)
# (you can leave off the http:,, if it's not https)
#
# <info> consists of the following shared files:
# content: result of the request
# url: decoded url of the request for convenience
# headers: response headers
# additionally, types text/plain and text/html have:
# links/: a directory of symlinks named (uniquely) after their related text
# additionally, types text/html, text/xml, and application/json have:
#
# /mnt/<encoded-url>/tree/<tree-path>
#
# <tree-path> varies depending on the encoding, but for JSON each path
# component corresponds to an object property (numbers for array indices),
# and for html & xml it corresponds to an XPath
#
# Right now the only way to unmount is "fusermount -u /path/to/mntpt" which
# will also make the script exit; ^C or killing the pid won't unmount the fs!
#
# TODO: threading!
## Standard Modules
use strict;
use warnings;
use Getopt::Std;
use File::Basename;
use POSIX qw(:errno_h :fcntl_h ceil);
use Encode;
use List::Util 'first';
use Digest::MD5 'md5_base64';
use threads;
use threads::shared;
## Required Modules
use Fuse;
use LWP::UserAgent;
use URI;
## Optional Modules
my $grok_css = eval { require CSS };
my $grok_json = eval { require JSON };
my $grok_yaml = eval { require YAML };
my $grok_xpath = eval { require XML::XPath };
my $grok_html = $grok_xpath
&& eval { require HTML::TreeBuilder };
## Setup and Options
my $prog = basename($0);
my %opt;
getopts('hd', \%opt);
usage() if $opt{h} || @ARGV != 1;
sub usage {
die <<EOU;
usage: $prog [-d] mount-point
-d: daemonize; default is to run in foreground with debugging
EOU
}
## Daemonizing
if ($opt{d}) {
close(STDOUT);
close(STDIN);
close(STDERR);
fork and exit;
}
## Global var initialization
my $JSON = JSON->new->allow_nonref(1);
my $ua = LWP::UserAgent->new(agent => 'Firefox'); # TODO: add sexy useragent
my %hcache :shared; # TODO: replace with Tie::Cache::LRU or equiv.
my %ocache; # TODO: replace with Tie::Cache::LRU or equiv.
my $mount = $ARGV[0]; $mount =~ s,/+$,,;
die "No such directory $mount\n" unless -d $mount;
## Extensible handler configuration
my %handlers = (
# links handlers take content and return a hash of name => links
# names must (obviously) be unique, and links may be relative strings
links => {
qr{^text/plain$}i => \&find_text_links,
qr{html$}i => \&find_html_links,
qr{/css$}i => \&find_css_links,
},
# tree handlers take content and path parts and return a valid info hashref
# or undef
tree => {
qr{/(javascript|(x-)?json)$}i => $grok_json && \&build_json_tree,
qr{xml$}i => $grok_xpath && \&build_xml_tree,
qr{html$}i => $grok_html && \&build_html_tree,
qr{/(x-)?yaml$}i => $grok_yaml && \&build_yaml_tree,
qr{/css$}i => 0 && $grok_css && \&build_css_tree,
},
);
## Run filesystem
Fuse::main(
mountpoint => $mount,
debug => 0,
threaded => 1,
mountopts => 'allow_other',
'getattr' => 'main::rfs_getattr',
'readlink' => 'main::rfs_readlink',
'getdir' => 'main::rfs_getdir',
'open' => 'main::rfs_open',
'read' => 'main::rfs_read',
);
exit; # never reached?
## Filesystem calbacks
sub rfs_getattr {
my $info = parse_path(@_) || return -ENOENT();
my $mtime = $info->{mtime} || time();
my $contents = file_contents($info);
my $size = defined($contents) ? length($contents) : 0;
my $blksize = 1024 * 1024;
my $blocks = ceil($size / $blksize);
my $type;
if ($info->{dir}) {
$type = 0040;
}
elsif ($info->{link_to}) {
$type = 0120;
}
elsif (defined $contents) {
$type = 0100;
}
else {
warn "unknown reponse from parse_path()\n";
return -ENOENT();
}
my $mode = 0444 + ($type << 9); # world-readable + type of file
(0, 0, $mode, 1, 0, 0, 0, $size, time(), $mtime, $mtime, $blksize, $blocks);
}
sub rfs_readlink {
my $info = parse_path(@_);
$info && $info->{link_to};
}
sub rfs_getdir {
my $info = parse_path(@_) || return -ENOENT();
return -ENOTDIR() unless $info->{dir};
('.', '..', map { encode_name($_) } @{$info->{dir}}, 0);
}
sub rfs_open {
my ($path, $flags) = @_;
my $info = parse_path($path, 1);
# TODO: check flags
return 0 if $info && !$info->{dir};
return -EACCES();
}
sub rfs_read {
my ($path, $bytes, $off) = @_;
my $info = parse_path($path) || return -ENOENT();
my $contents = file_contents($info) || return -ENOENT();
substr($contents, $off, $bytes);
}
## Utility functions
sub file_contents {
my ($info) = @_;
my $file = $info->{file};
return undef unless defined $file;
Encode::is_utf8($file) ? encode('utf-8', $file) : $file;
}
sub parse_path {
my ($path, $open) = @_;
$path =~ s{(?<=[^/])/+$}{}; # trim off trailing /s
# top level dir: show our cached urls shortened (handy)
return { dir => [map { s,^http://,,; $_ } keys %hcache] }
if $path eq '/';
# skip system .file queries
return undef if $path =~ m{^/\.};
# turn /foo/ba,r/baz -> ('foo', 'ba/r', 'baz')
my @parts = map { decode_name($_) } split(/\//, substr($path, 1));
# decode url and normalize
my $url = shift(@parts);
$url = "http://$url" unless $url =~ m{^[a-z]+:};
# next component is optional cmd
my $cmd = shift(@parts) || '';
# perform fetch and set vars
my $res = fetch_cached($url, $open && $cmd eq 'content');
my $mtime = $res->last_modified || time();
my $content = $res->decoded_content;
my $type = $res->header('Content-type'); $type =~ s/\s*;.*//;
if (!$cmd) {
my @kids = qw(headers content url); # shared virtual files/dirs
push(@kids, 'links') if handler_for('links', $type);
push(@kids, 'tree') if handler_for('tree', $type);
return { dir => \@kids, mtime => $mtime };
}
elsif ($cmd eq 'content') {
return { file => $content, mtime => $mtime };
}
elsif ($cmd eq 'url') {
return { file => "$url\n", mtime => $mtime };
}
elsif ($cmd eq 'headers') {
my $which = shift(@parts);
if (defined $which) {
my $hdr = $res->header($which);
return undef unless defined $hdr;
my $info = file_or_link_to($hdr);
$info->{mtime} = $mtime;
return $info;
}
else {
return { dir => [ $res->header_field_names ], mtime => $mtime };
}
}
elsif ($cmd eq 'links') {
my $which = shift(@parts);
my $handler = handler_for('links', $type) || return undef;
my @links = $handler->($content);
# turn ([ foo => bar ], [ foo => baz ], [ quux => garply ])
# into ( foo => bar, foo2 => baz, quux => garply )
my %links;
for (@links) {
my ($name, $link) = @$_;
my ($base, $n);
for ($base = $name, $n = 2; exists $links{$name}; $name = $base.($n++)) {}
$links{$name} = $link;
}
undef @links;
if (defined $which) {
my $lto = link_to($links{$which}, $url) || return undef;
return { link_to => $lto, mtime => $mtime };
}
else {
return {
dir => [ grep { link_to($links{$_}, $url) } keys %links ],
mtime => $mtime
};
}
}
elsif ($cmd eq 'tree') {
my $handler = handler_for('tree', $type) || return undef;
my $info = $handler->($content, $url, @parts) || return undef;
$info->{mtime} = $mtime;
return $info;
}
}
sub decode_name {
my ($str) = @_;
$str =~ s#,#/#g;
$str =~ s/\%2C/,/ig;
$str;
}
sub encode_name {
my ($url) = @_;
$url =~ s/,/\%2C/g;
$url =~ s#/#,#g;
$url;
}
sub fetch_cached {
my ($url, $force) = @_;
my $res;
{
lock(%hcache); # TODO finer-grained locking
unless ($force) {
my $msg = $hcache{$url};
$res = HTTP::Response->parse($msg) if $msg;
}
unless ($res) {
print "GET $url\n" unless $opt{d};
$res = $ua->get($url);
$hcache{$url} = $res->as_string;
}
}
$res;
}
sub link_to {
my ($str, $base) = @_;
$str = URI->new_abs($str, $base) if $base;
return undef unless $str =~ m{^https?://\S+$};
return $mount . '/' . encode_name($str);
}
sub file_or_link_to {
my ($str, $base) = @_;
my $lto = link_to($str, $base);
return { link_to => $lto } if $lto;
return { file => "$str\n" };
}
sub parse_css {
my ($css) = @_;
ocache($css, sub {
my $obj = new CSS;
$obj->read_string($_[0]);
$obj;
});
}
sub parse_xml {
my ($xml) = @_;
ocache($xml, sub { XML::XPath->new(xml => $_[0]) });
}
sub html2xml {
my ($html) = @_;
ocache($html, sub { HTML::TreeBuilder->new_from_content($_[0])->as_XML });
}
sub ocache {
my ($content, $gen) = @_;
$ocache{md5_base64(encode('utf-8', $content))} ||= eval { $gen->($content) };
}
sub handler_for {
my ($thing, $type) = @_;
my $key = (first { $type =~ $_ } keys(%{$handlers{$thing}})) || return undef;
$handlers{$thing}{$key};
}
## Handler implementations
sub find_text_links {
my ($content) = @_;
return map { [ URI->new($_)->host => $_ ] }
sort ($content =~ m{\bhttp://[^\s"'<>]+}g);
}
sub find_html_links {
my ($html) = @_;
my @links;
# good xpathy version
if ($grok_html) {
my $obj = parse_xml(html2xml($html));
@links = map {
my $name = $_->string_value;
$name =~ s/^\s+|\s+$//g;
$name = 'untitled' unless length $name;
[ $name => scalar($_->getAttribute('href')) ];
} eval { $obj->findnodes('//a[@href]') };
}
# failover regexpy version
else {
while ($html =~ /<a [^>]*\bhref=["']?([^"'>]+)[^>]*>([^<]+)/g) {
my ($href, $name) = ($1, $2);
$name =~ s/^\s+|\s+$//g;
push(@links, [ $name => $href ]) if length $name && $href;
}
}
@links;
}
sub build_html_tree {
my ($html, $url, @parts) = @_;
build_xml_tree(html2xml($html), $url, @parts);
}
sub find_css_links {
my ($css) = @_;
return () unless $css =~ /url\(/;
my @links;
if ($grok_css) {
my $obj = parse_css($css);
for my $style (@{$obj->{styles}}) {
for my $prop (@{$style->{properties}}) {
for my $val (@{$prop->{'values'}}) {
$val->{value} =~ /url\((.+)\)/ || next;
my $url = $1;
for my $sel (map { $_->{name} } @{$style->{selectors}}) {
$sel =~ s,/\*.*\*/,,g;
$sel =~ s/^\s+|\s+$//g;
push(@links, [ $sel => $url ]);
}
}
}
}
}
else {
@links = map { [ url => $1 ] } ($css =~ /url\((.+)\)/g);
}
# common cleanup
for (@links) {
$_->[1] =~ s/^(['"])(.+)\1$/$2/;
$_->[1] =~ s/\\\\/\\/g;
$_->[1] =~ s/\\([(),\s'"])/$1/g;
}
@links;
}
sub build_css_tree {
my ($css, $url, @parts) = @_;
}
sub build_json_tree {
my ($json, $url, @parts) = @_;
my $obj = ocache($json, sub { JSON::from_json($_[0]) }) || return undef;
build_obj_tree($obj, $url, sub {
my $str = $JSON->encode($_[0]);
$str =~ s/^"|"$//g;
$str;
}, @parts);
}
sub build_yaml_tree {
my ($yaml, $url, @parts) = @_;
my $obj = ocache($yaml, sub { YAML::Load($_[0]) }) || return undef;
build_obj_tree($obj, $url, undef, @parts);
}
sub build_obj_tree {
my ($obj, $url, $output, @parts) = @_;
$output ||= sub { $_[0] };
for (@parts) {
if (ref($obj) eq 'HASH') {
$obj = $obj->{$_};
}
elsif (ref($obj) eq 'ARRAY') {
$obj = $obj->[$_];
}
else {
return undef;
}
}
if (ref($obj) eq 'HASH') {
return { dir => [ keys %$obj ] };
}
elsif (ref($obj) eq 'ARRAY') {
return { dir => [ 0..$#$obj ] };
}
else {
return file_or_link_to($output->($obj));
}
}
sub build_xml_tree {
my ($xml, $url, @parts) = @_;
my $path = '/' . join('/', @parts);
my $xp = parse_xml($xml) || return undef;
my $ns = eval { $xp->find($path) };
return undef unless $ns && $ns->size;
my $node = $ns->get_node(1);
if ($node->isa('XML::XPath::Node::Element')) {
my (%kids, %count);
for my $kid ($node->getChildNodes()) {
if (my $name = $kid->getName) {
if ($count{$name}++) {
if ($count{$name} == 2) {
$kids{$name.'[1]'} = 1;
delete $kids{$name};
}
$kids{$name.'['.$count{$name}.']'} = 1;
}
else {
$kids{$name} = 1;
}
}
else {
$kids{'text()'} = 1;
}
}
return { dir => [
keys(%kids),
map { '@' . $_->getName } $node->getAttributes()
]};
}
elsif ($node->isa('XML::XPath::Node::Attribute')) {
return file_or_link_to($node->getNodeValue,
($node->getName =~ /^(href|src)$/i) && $url);
}
elsif ($node->isa('XML::XPath::Node::Text')) {
return file_or_link_to($node->getValue);
}
else {
return undef;
}
}
|
 |