Plagger で RSS1.0 を出力するモジュール
Plaggerで新着図書一覧ページから、dcモジュールを使ったRSS 1.0を出力してみたかったので、作ってみました。
テスト環境が、ActivePerl+XPなので、それ以外の環境ではディレクトリの指定など適宜修正してください。
コーディングとしては手抜きの箇所が多々ありますが、一応動くので、載せておきます。
テスト環境が、ActivePerl+XPなので、それ以外の環境ではディレクトリの指定など適宜修正してください。
コーディングとしては手抜きの箇所が多々ありますが、一応動くので、載せておきます。
作成・修正が必要なモジュール
- Plagger::Plugin::Publish::Feed2;
- Plagger::Plugin::Filter::EntryFullText;
- Plagger::Entry;
- Plagger::Plugin::SmartFeed;
yamlの例
- config.yamlの例
- /Plagger/assets/plugins/Filter-EntryFullTextのhit-u.yamlの例
モジュールの修正箇所
- Plagger::Plugin::Publish::Feed2;
package Plagger::Plugin::Publish::Feed2;
# Publish plugin that writes each Plagger feed to a file, either as RSS 1.0
# with Dublin Core / syndication / taxonomy modules (format: RSS, built with
# XML::RSS) or through XML::Feed for any other format (default: Atom).
use strict;
use base qw( Plagger::Plugin );

our $VERSION = 0.01;

use XML::Feed;
use XML::Feed::Entry;
use XML::RSS::LibXML;
use File::Spec;
use XML::RSS;

$XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";

# Channel-level Dublin Core values used when the plugin config does not
# provide `subject`, `rights` or `language`. These are the values the module
# has always emitted, kept so existing configurations produce the same output.
# NOTE(review): they look like sample values copied from an XML::RSS example;
# set the config keys to describe the real feed.
my %channel_dc_default = (
    subject  => "Linux Software",
    rights   => 'Copyright 1999, Freshmeat.net',
    language => 'en-us',
);

# Default taxo:topics for the channel; overridable with an array reference
# under the `taxo` config key.
my @channel_taxo_default = (
    'http://dmoz.org/Computers/Internet',
    'http://dmoz.org/Computers/PC',
);

# Hooks publish_feed() into the 'publish.feed' phase and prepares the
# output directory.
sub register {
    my($self, $context) = @_;
    $context->register_hook(
        $self,
        'publish.feed' => \&publish_feed,
    );
    $self->init_feed($context);
}

# Creates the configured output directory when it is missing and turns
# `full_content` on unless the config sets it explicitly.
sub init_feed {
    my($self, $context) = @_;

    # check dir
    my $dir = $self->conf->{dir};
    unless (-e $dir && -d _) {
        mkdir $dir, 0755 or $context->error("mkdir $dir: $!");
    }

    unless (exists $self->conf->{full_content}) {
        $self->conf->{full_content} = 1;
    }
}

# Hook entry point: serialises $args->{feed} in the configured format and
# saves it under conf->{dir} using the name produced by gen_filename().
sub publish_feed {
    my($self, $context, $args) = @_;

    my $conf = $self->conf;
    my $f    = $args->{feed};
    my $feed_format = $conf->{format} || 'Atom';

    # Build only the document that will actually be written.
    my $xml = $feed_format eq 'RSS'
        ? $self->_build_rss($f)
        : $self->_build_feed($f, $feed_format);

    # generate file path
    my $filepath = File::Spec->catfile($conf->{dir}, $self->gen_filename($f));
    $context->log(info => "save feed for " . $f->link . " to $filepath ($feed_format)");

    # Both formats are written as UTF-8: decode to characters first, then let
    # the :utf8 layer encode, so no "Wide character" warnings are raised and
    # Latin-1-only documents are not written in the wrong encoding.
    utf8::decode($xml) unless utf8::is_utf8($xml);
    open my $output, ">:utf8", $filepath or $context->error("$filepath: $!");
    print $output $xml;
    close $output or $context->error("$filepath: $!");
}

# Returns the RSS 1.0 document for feed $f as a string.
sub _build_rss {
    my($self, $f) = @_;

    my $conf = $self->conf;
    my $taxo = ref $conf->{taxo} eq 'ARRAY' ? $conf->{taxo} : [ @channel_taxo_default ];

    my $rss = XML::RSS->new(version => '1.0');
    $rss->channel(
        title       => $f->title,
        link        => $f->link,
        description => $f->description,
        dc => {
            date      => Plagger::Date->now,
            subject   => $self->_channel_dc('subject'),
            creator   => $f->author,
            publisher => "Plagger/$Plagger::VERSION",
            rights    => $self->_channel_dc('rights'),
            language  => $self->_channel_dc('language'),
        },
        syn => {
            updatePeriod    => "hourly",
            updateFrequency => "1",
            updateBase      => "1901-01-01T00:00+00:00",
        },
        taxo => $taxo,
    );

    for my $e ($f->entries) {
        $rss->add_item(
            title       => $e->title,
            link        => $e->permalink,
            description => $e->body_text,
            dc => {
                publisher => $e->publisher,
                # tags may be an array reference; passing it raw would put
                # "ARRAY(0x...)" into dc:subject.
                subject   => $self->_tags_as_string($e->tags),
                creator   => $e->author,
                language  => $e->language,
                date      => $e->date,
            },
        );
    }

    return $rss->as_string;
}

# Returns the XML::Feed serialisation of feed $f in $feed_format. This is
# only called for formats other than 'RSS'.
sub _build_feed {
    my($self, $f, $feed_format) = @_;

    # Must stay in effect until as_xml() has run.
    local $XML::Atom::DefaultVersion = "1.0";

    my $feed = XML::Feed->new($feed_format);
    $feed->title($f->title);
    $feed->link($f->link);
    $feed->modified(Plagger::Date->now);
    $feed->generator("Plagger/$Plagger::VERSION");
    $feed->description($f->description || '');
    $feed->author($f->author) if $f->primary_author;

    if ($feed_format eq 'Atom') {
        $feed->{atom}->id("tag:plagger.org,2006:" . $f->id);
    }

    for my $e ($f->entries) {
        my $entry = XML::Feed::Entry->new($feed_format);
        $entry->title($e->title);
        $entry->link($e->permalink);
        $entry->summary($e->body_text) if defined $e->body;

        # hack to bypass XML::Feed Atom 0.3 crufts (type="text/html"):
        # set the content on the wrapped entry object directly.
        if ($self->conf->{full_content} && defined $e->body) {
            $entry->{entry}->content($e->body);
        }

        my $tags = $self->_tags_as_string($e->tags);
        $entry->category($tags) if defined $tags && length $tags;

        $entry->issued($e->date)   if $e->date;
        $entry->modified($e->date) if $e->date;

        $entry->id("tag:plagger.org,2006:" . $e->id);

        if ($e->has_enclosure) {
            for my $enclosure (grep { defined $_->url && !$_->is_inline } $e->enclosures) {
                $entry->add_enclosure({
                    url    => $enclosure->url,
                    length => $enclosure->length,
                    type   => $enclosure->type,
                });
            }
        }

        $feed->add_entry($entry);
    }

    return $feed->as_xml;
}

# Returns the configured value for a channel dc field, or the built-in
# default when the config key is not defined.
sub _channel_dc {
    my($self, $key) = @_;
    my $value = $self->conf->{$key};
    return defined $value ? $value : $channel_dc_default{$key};
}

# Flattens an entry's tags into one space-separated string. Array references
# are joined; anything else (a plain string, undef) is returned unchanged.
sub _tags_as_string {
    my($self, $tags) = @_;
    return $tags unless ref $tags eq 'ARRAY';
    return join ' ', @$tags;
}

# Placeholders accepted in conf->{filename}; each receives the feed object.
my %formats = (
    'u' => sub { my $s = $_[0]->url;  $s =~ s!^https?://!!; $s },
    'l' => sub { my $s = $_[0]->link; $s =~ s!^https?://!!; $s },
    't' => sub { $_[0]->title },
    'i' => sub { $_[0]->id },
);

my $format_re = qr/%(u|l|t|i)/;

# Expands the %u / %l / %t / %i placeholders of conf->{filename} (default
# '%i.rss' or '%i.atom', depending on the format) into a safe file name.
sub gen_filename {
    my($self, $feed) = @_;

    # Same default as publish_feed(), so an unset format is never compared.
    my $format = $self->conf->{format} || 'Atom';
    my $file = $self->conf->{filename} ||
        '%i.' . ($format eq 'RSS' ? 'rss' : 'atom');
    $file =~ s{$format_re}{
        $self->safe_filename($formats{$1}->($feed))
    }egx;
    $file;
}

# Replaces every run of characters other than word characters and
# whitespace, and then every run of whitespace, with a single underscore.
sub safe_filename {
    my($self, $path) = @_;
    $path =~ s![^\w\s]+!_!g;
    $path =~ s!\s+!_!g;
    $path;
}

# XXX okay, this is a hack until XML::Feed is updated
*XML::Feed::Entry::Atom::add_enclosure = sub {
    my($entry, $enclosure) = @_;
    my $link = XML::Atom::Link->new;
    $link->rel('enclosure');
    $link->type($enclosure->{type});
    $link->href($enclosure->{url});
    $link->length($enclosure->{length});
    $entry->{entry}->add_link($link);
};

*XML::Feed::Entry::RSS::add_enclosure = sub {
    my($entry, $enclosure) = @_;
    $entry->{entry}->{enclosure} = {
        url    => $enclosure->{url},
        type   => $enclosure->{type},
        length => $enclosure->{length},
    };
};

1;

__END__
- Plagger::Plugin::Filter::EntryFullText;
14行目付近への追加
use Plagger::Tag;
149行目付近への追加
# Copy the extra fields captured by the site definition onto the entry.
# Tags go through Plagger::Tag->parse; an earlier variant passed the raw
# string instead:
#   $args->{entry}->tags($data->{tags}) if $data->{tags};
if ($data->{tags}) {
    $args->{entry}->tags(Plagger::Tag->parse($data->{tags}));
}

# The remaining fields are copied verbatim, and only when they hold a true value.
for my $field (qw( author publisher language )) {
    next unless $data->{$field};
    $args->{entry}->$field($data->{$field});
}
- Plagger::Entry;
5行目付近
# Generate one accessor per entry field. `publisher` and `language` are the
# fields Publish::Feed2 reads to fill dc:publisher and dc:language.
__PACKAGE__->mk_accessors(
    qw(
        title author tags link feed_link id summary body rate icon meta source
        publisher language
    )
);
103行目付近(1;の手前)
# Despite the add_ prefix, these three methods only read: each returns the
# current value of the matching accessor and takes no arguments besides
# the invocant.

# Returns the entry's publisher.
sub add_publisher {
    my $self = shift;
    return $self->publisher;
}

# Returns the entry's language. The accessor is named `language`; the
# earlier spelling `langage` called a method that does not exist.
sub add_language {
    my $self = shift;
    return $self->language;
}

# Returns the entry's tags.
sub add_tags {
    my $self = shift;
    return $self->tags;
}
- Plagger::Plugin::SmartFeed;
# Fill the generated feed's description and author from the plugin config;
# either one falls back to the feed title when its config value is false.
for my $field (qw( description author )) {
    $feed->$field( $self->conf->{$field} || $feed->title );
}
- config.yamlの例
global:
  plugin_path:
    - D:\Perl\site\lib\Plagger\Plugin
  assets_path: D:\Perl\site\lib\Plagger\assets
  timezone: Asia/Tokyo
  log:
    level: debug
  cache:
    base: c:\plagger

# feedで指定したURLがある*.yamlを探し、データを取得しフィルタします
# urlをいじったときはEFTのyamlも修正する。
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
          # - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/new.html
          # - url: http://opac.lib.hit-u.ac.jp/opac-new/book2/new.html
          meta:
            follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
            # ?と&はエスケープしないといけないらしい。
            # http://shakenbu.org/yanagi/d/20060923.html

  # EntryFullText で custom_feed_handle と custom_feed_follow_link / custom_feed_follow_xpath を指定した場合、 CustomFeed::Simple はいらない。
  # - module: CustomFeed::Config
  # - module: CustomFeed::Simple

  # データの重複を排除します
  # - module: Filter::Rule
  #   rule:
  #     module: Deduped
  #     path: C:\plagger\dedupe-hit-u.db

  - module: SmartFeed
    rule:
      module: Fresh
      mtime:
        path: C:\plagger\tmp\foo.tmp
        autoupdate: 1
    config:
      title: 一橋大学附属図書館新着図書
      link: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
      description: 一橋大学附属図書館の新着図書リストからRSS1.0を生成します。
      author: 自分のお名前

  # もしかするとEFTとCustomFeed::Configは両立しない?
  # 一致するものがなかった場合、<content:encoded>に入れるかどうか
  - module: Filter::EntryFullText
    config:
      store_html_on_failure: 0

  # 取得したファイルからHTMLタグを除去します
  - module: Filter::HTMLScrubber

  - module: Publish::Feed2
    config:
      format: RSS
      dir: c:\plagger
      filename: rss-hit3.xml
- /Plagger/assets/plugins/Filter-EntryFullText のhit-u.yaml
author: wono
custom_feed_handle: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
custom_feed_follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
handle: http://opac\.lib\.hit-u\.ac\.jp/opac/books-query\?mode=2\&place=\&code=\d+
extract: <H1 CLASS="TR">(.*?)</H1>.*?<SPAN CLASS="PUBL" ID="VALUE">(.*?)</SPAN>.+?<SPAN CLASS="AL" ID="VALUE"><A HREF="/opac/disp-query\?mode=2\&con1=3\&kywd1=[\%a-zA-Z0-9]+\&con2=3\&con3=4\&disp=1">(.*?)</A>.*?<SPAN CLASS="CLS" ID="VALUE">(.*?)</A>.*?<SPAN CLASS="TXTL" ID="VALUE">(.*?)</SPAN>
extract_capture: title publisher author tags language
extract_after_hook: |
  if ($data->{publisher} eq "") {$data->{publisher} = "test"};
  $data->{body} = $data->{publisher} . " author:" . $data->{author} . " lang:" . $data->{language};
  if (!$data->{tags}) {
    $data->{tags} = "ZZZZZZZZZZZZZZZZZZ";
  } else {
    $data->{tags} =~ s/\<[A-Za-z0-9 \%\/\?\-\&\=\"]+\>//ig;
  }