※上記の広告は60日以上更新のないWIKIに表示されています。更新することで広告が下部へ移動します。

Plagger で RSS1.0 を出力するモジュール

 Plaggerで新着図書一覧ページからRSS1.0を生成し、その中でdcモジュールを使ってみたかったので、作ってみました。
 テスト環境が、ActivePerl+XPなので、それ以外の環境ではディレクトリの指定など適宜修正してください。
 コーディングとしては手抜きの箇所が多々ありますが、一応動くので、載せておきます。

作成・修正が必要なモジュール

  • Plagger::Plugin::Publish::Feed2;
  • Plagger::Plugin::Filter::EntryFullText;
  • Plagger::Entry;
  • Plagger::Plugin::SmartFeed;

yamlの例

  • config.yamlの例
  • /Plagger/assets/plugins/Filter-EntryFullTextのhit-u.yamlの例

モジュールの修正箇所

  • Plagger::Plugin::Publish::Feed2;
package Plagger::Plugin::Publish::Feed2;

use strict;
use base qw( Plagger::Plugin );

our $VERSION = 0.01;

use XML::Feed;
use XML::Feed::Entry;
use XML::RSS::LibXML;
use File::Spec;
use XML::RSS;

$XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";

# Plugin entry point: subscribes publish_feed to the 'publish.feed' hook on
# the given context, then runs init_feed with that same context.
#
#   $self    - plugin instance
#   $context - object providing register_hook()
sub register {
    my $self    = shift;
    my $context = shift;

    my %hooks = ('publish.feed' => \&publish_feed);
    $context->register_hook($self, %hooks);

    return $self->init_feed($context);
}

# Prepares the plugin configuration before any feed is published.
#
# Creates conf->{dir} when it is not already a directory, reporting a failed
# mkdir through $context->error, and switches full_content on unless the
# configuration already contains that key (an explicit 0 is left alone).
#
#   $self    - plugin instance; conf() must return the configuration hash ref
#   $context - object providing error()
sub init_feed {
    my($self, $context) = @_;

    # make sure the output directory is there
    my $dir = $self->conf->{dir};
    if (!-d $dir) {
        mkdir($dir, 0755) || $context->error("mkdir $dir: $!");
    }

    # default only when the key is absent
    exists $self->conf->{full_content}
        or $self->conf->{full_content} = 1;
}

# Hook handler for 'publish.feed': serialises the feed in $args->{feed} and
# writes it to a file under conf->{dir}.
#
# conf->{format} selects the serialiser. 'RSS' produces RSS 1.0 carrying
# Dublin Core (dc), syndication (syn) and taxonomy (taxo) elements through
# XML::RSS; any other value (default 'Atom') is handed to XML::Feed.
#
# Arguments:
#   $self    - plugin instance; reads the conf keys format, dir, filename
#              (via gen_filename) and full_content
#   $context - object used here for log() and error()
#   $args    - hash ref; $args->{feed} is the feed object to publish
#
# A failed open or close of the output file is reported via $context->error.
sub publish_feed {
    my($self, $context, $args) = @_;

    my $conf = $self->conf;
    my $f = $args->{feed};
    my $feed_format = $conf->{format} || 'Atom';

    local $XML::Atom::DefaultVersion = "1.0";

    # Only the structure that is actually written out gets built.
    my $xml;
    if ($feed_format eq 'RSS') {
        # generate RSS 1.0
        my $rss = XML::RSS->new(version => '1.0');
        $rss->channel(
            title        => $f->title,
            link         => $f->link,
            description  => $f->description,
            dc => {
                date       => Plagger::Date->now,
                # NOTE(review): subject, rights and language here, and the
                # taxo topics below, look like placeholder values rather
                # than data describing this feed -- confirm and replace.
                subject    => "Linux Software",
                creator    => $f->author,
                publisher  => "Plagger/$Plagger::VERSION",
                rights     => 'Copyright 1999, Freshmeat.net',
                language   => 'en-us',
            },
            syn => {
                updatePeriod     => "hourly",
                updateFrequency  => "1",
                updateBase       => "1901-01-01T00:00+00:00",
            },
            taxo => [
                'http://dmoz.org/Computers/Internet',
                'http://dmoz.org/Computers/PC'
            ]
        );

        # create RSS 1.0 entries
        for my $e ($f->entries) {
            $rss->add_item(
                title       => $e->title,
                link        => $e->permalink,
                description => $e->body_text,
                dc => {
                    publisher => $e->publisher,
                    # NOTE(review): tags is passed through untouched; if it
                    # can be an array ref, check how XML::RSS renders it.
                    subject   => $e->tags,
                    creator   => $e->author,
                    language  => $e->language,
                    date      => $e->date,
                }
            );
        }

        $xml = $rss->as_string;
    }
    else {
        # generate the feed through XML::Feed
        my $feed = XML::Feed->new($feed_format);
        $feed->title($f->title);
        $feed->link($f->link);
        $feed->modified(Plagger::Date->now);
        $feed->generator("Plagger/$Plagger::VERSION");
        $feed->description($f->description || '');
        $feed->author($f->author) if $f->primary_author;

        if ($feed_format eq 'Atom') {
            $feed->{atom}->id("tag:plagger.org,2006:" . $f->id);
        }

        # add entries
        for my $e ($f->entries) {
            my $entry = XML::Feed::Entry->new($feed_format);
            $entry->title($e->title);
            $entry->link($e->permalink);
            $entry->summary($e->body_text) if defined $e->body;

            # hack to bypass XML::Feed Atom 0.3 crufts (type="text/html"):
            # set the content on the wrapped entry object directly
            if ($conf->{full_content} && defined $e->body) {
                $entry->{entry}->content($e->body);
            }

            $entry->category($e->tags) if $e->tags;
            $entry->issued($e->date)   if $e->date;
            $entry->modified($e->date) if $e->date;

            $entry->id("tag:plagger.org,2006:" . $e->id);

            if ($e->has_enclosure) {
                for my $enclosure (grep { defined $_->url && !$_->is_inline } $e->enclosures) {
                    $entry->add_enclosure({
                        url    => $enclosure->url,
                        length => $enclosure->length,
                        type   => $enclosure->type,
                    });
                }
            }

            $feed->add_entry($entry);
        }

        $xml = $feed->as_xml;
    }

    # generate file path
    my $filepath = File::Spec->catfile($conf->{dir}, $self->gen_filename($f));

    $context->log(info => "save feed for " . $f->link . " to $filepath ($feed_format)");

    # The XML is handled as characters from here on, so both formats are
    # written through the :utf8 layer; a plain ">" handle would warn about
    # wide characters and emit Latin-1 bytes for text below U+0100.
    utf8::decode($xml) unless utf8::is_utf8($xml);
    open my $output, ">:utf8", $filepath or $context->error("$filepath: $!");
    print $output $xml;
    # buffered write errors only surface at close
    close $output or $context->error("$filepath: $!");
}

# Expansions for the % placeholders accepted in conf->{filename}.
# Each handler receives the feed object and returns the replacement text:
#   %u  feed url  with a leading http:// or https:// removed
#   %l  feed link with a leading http:// or https:// removed
#   %t  feed title
#   %i  feed id
my %formats = (
    'i' => sub { return $_[0]->id },
    't' => sub { return $_[0]->title },
    'l' => sub { (my $link = $_[0]->link) =~ s!^https?://!!; return $link },
    'u' => sub { (my $url  = $_[0]->url)  =~ s!^https?://!!; return $url },
);

# Matches a single placeholder and captures its letter in $1.
my $format_re = qr/%(u|l|t|i)/;

# Builds the output file name for $feed.
#
# Starts from conf->{filename}; when that is unset (or otherwise false) the
# template is '%i.rss' for format 'RSS' and '%i.atom' for anything else.
# Every %u, %l, %t or %i placeholder is then replaced by the matching feed
# value after it has been passed through safe_filename.
#
#   $self - plugin instance (provides conf and safe_filename)
#   $feed - feed object handed to the placeholder handlers
# Returns the expanded file name (no directory part).
sub gen_filename {
    my($self, $feed) = @_;

    my $template = $self->conf->{filename};
    unless ($template) {
        my $extension = $self->conf->{format} eq 'RSS' ? 'rss' : 'atom';
        $template = '%i.' . $extension;
    }

    $template =~ s{$format_re}{
        my $expand = $formats{$1};
        $self->safe_filename($expand->($feed))
    }egx;

    return $template;
}

# Makes a string usable as a file name: each run of characters that are
# neither word characters nor whitespace becomes one "_", and afterwards each
# run of whitespace becomes one "_".
#
#   $self - unused
#   $path - text to sanitise
# Returns the sanitised copy.
sub safe_filename {
    my($self, $path) = @_;

    for ($path) {
        s![^\w\s]+!_!g;
        s!\s+!_!g;
    }

    return $path;
}

# XXX okay, this is a hack until XML::Feed is updated
# Installs add_enclosure on XML::Feed::Entry::Atom. The enclosure hash ref
# (keys url, type, length) is turned into an XML::Atom::Link with
# rel="enclosure", which is appended to the wrapped entry in $entry->{entry}.
*XML::Feed::Entry::Atom::add_enclosure = sub {
    my($entry, $enclosure) = @_;

    my $link = XML::Atom::Link->new;
    $link->rel('enclosure');

    # [ link setter => enclosure key ], applied in this order
    for my $pair ([ type => 'type' ], [ href => 'url' ], [ length => 'length' ]) {
        my($setter, $key) = @$pair;
        $link->$setter($enclosure->{$key});
    }

    $entry->{entry}->add_link($link);
};

# Installs add_enclosure on XML::Feed::Entry::RSS. The url, type and length
# values of the enclosure hash ref are copied into a new hash that is stored
# as $entry->{entry}{enclosure}, replacing whatever was stored there before.
*XML::Feed::Entry::RSS::add_enclosure = sub {
    my($entry, $enclosure) = @_;

    my @fields = qw(url type length);
    my %attrs;
    @attrs{@fields} = @{$enclosure}{@fields};

    $entry->{entry}->{enclosure} = \%attrs;
};


1;

__END__

  • Plagger::Plugin::Filter::EntryFullText;
14行目付近への追加
use Plagger::Tag;
149行目付近への追加
   # Copy the extra extracted fields onto the entry. Each field is set only
   # when its value in $data is true; tags goes through Plagger::Tag->parse.
   $args->{entry}->tags(Plagger::Tag->parse($data->{tags})) if $data->{tags};
   # Earlier variant that stored the raw tags value without parsing:
   # $args->{entry}->tags($data->{tags}) if $data->{tags};
   $args->{entry}->author($data->{author}) if $data->{author};
   $args->{entry}->publisher($data->{publisher}) if $data->{publisher};
   $args->{entry}->language($data->{language}) if $data->{language};

  • Plagger::Entry;
 5行目付近
# Accessor list for the entry; it includes publisher and language, which
# Publish::Feed2 reads when it fills the dc elements of an RSS 1.0 item.
__PACKAGE__->mk_accessors(qw( title author tags link feed_link id summary body rate icon meta source publisher language));
 103行目付近(1;の手前)
# Returns the entry's publisher value. Despite the add_ prefix nothing is
# added or modified; this only calls the publisher accessor.
sub add_publisher {
    my($entry) = @_;
    return $entry->publisher;
}

# Returns the entry's language value. Despite the add_ prefix nothing is
# added or modified; this only calls the language accessor.
sub add_language {
    my $self = shift;
    # The accessor is spelled "language", as declared through mk_accessors;
    # calling a misspelled method name would die at runtime.
    return $self->language;
}
# Returns the entry's tags value. Despite the add_ prefix nothing is added or
# modified; this only calls the tags accessor.
sub add_tags {
    my($entry) = @_;
    return $entry->tags;
}

  • Plagger::Plugin::SmartFeed;
   # Use the configured description and author for the generated feed; when
   # either is unset (or false) the feed title is used in its place.
   $feed->description( $self->conf->{description} || $feed->title );
   $feed->author( $self->conf->{author} || $feed->title );

  • config.yamlの例
# Example Plagger configuration: fetch the library's new-arrivals page and
# publish it as RSS 1.0 with the Publish::Feed2 plugin.
global:
 plugin_path:
  - D:\Perl\site\lib\Plagger\Plugin
 assets_path: D:\Perl\site\lib\Plagger\assets
 timezone: Asia/Tokyo
 log:
  level: debug
 cache:
  base: c:\plagger

# Plagger looks for the *.yaml asset that contains the URL given under feed,
# then fetches and filters the data.
# When the url is changed, update the EntryFullText (EFT) yaml as well.
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/new.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book2/new.html
          meta:
            follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
# Apparently ? and & have to be escaped.
# http://shakenbu.org/yanagi/d/20060923.html
# When EntryFullText specifies custom_feed_handle together with custom_feed_follow_link / custom_feed_follow_xpath, CustomFeed::Simple is not needed.
#  - module: CustomFeed::Config
#  - module: CustomFeed::Simple
# Remove duplicate entries
#  - module: Filter::Rule
#    rule:
#      module: Deduped
#    path: C:\plagger\dedupe-hit-u.db
  - module: SmartFeed
    rule:
      module: Fresh
      mtime:
        path: C:\plagger\tmp\foo.tmp
        autoupdate: 1
    config:
      title: 一橋大学附属図書館新着図書
      link: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
      description: 一橋大学附属図書館の新着図書リストからRSS1.0を生成します。
      author: 自分のお名前

# Possibly EFT and CustomFeed::Config cannot be used together?
# Whether to put the content into <content:encoded> when nothing matched
  - module: Filter::EntryFullText
    config:
      store_html_on_failure: 0
# Strip HTML tags from the fetched content
  - module: Filter::HTMLScrubber
  - module: Publish::Feed2
    config:
      format: RSS
      dir: c:\plagger
      filename: rss-hit3.xml

  • /Plagger/assets/plugins/Filter-EntryFullText のhit-u.yaml
# EntryFullText asset for the opac.lib.hit-u.ac.jp new-arrivals list.
# NOTE(review): the key descriptions below follow the key names; confirm the
# exact semantics against the Filter::EntryFullText plugin.
author: wono
# The list page itself; same URL as the feed url in the config.yaml example.
custom_feed_handle: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html

# Pattern for the per-book links on the list page (same pattern as
# follow_link in the config.yaml example).
custom_feed_follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
# Pattern for the per-book page URLs.
handle: http://opac\.lib\.hit-u\.ac\.jp/opac/books-query\?mode=2\&place=\&code=\d+
# Five capture groups; extract_capture below lists five names for them.
extract: <H1 CLASS="TR">(.*?)</H1>.*?<SPAN CLASS="PUBL" ID="VALUE">(.*?)</SPAN>.+?<SPAN CLASS="AL" ID="VALUE"><A HREF="/opac/disp-query\?mode=2\&con1=3\&kywd1=[\%a-zA-Z0-9]+\&con2=3\&con3=4\&disp=1">(.*?)</A>.*?<SPAN CLASS="CLS" ID="VALUE">(.*?)</A>.*?<SPAN CLASS="TXTL" ID="VALUE">(.*?)</SPAN>

extract_capture: title publisher author tags language
# Perl snippet operating on $data: an empty publisher becomes "test", body is
# assembled from publisher, author and language, and tags either gets a
# placeholder value (when false) or has markup-like <...> sequences removed.
extract_after_hook: |
  if ($data->{publisher} eq "") {$data->{publisher} = "test"};
  $data->{body} = $data->{publisher} . " author:" . $data->{author} . " lang:" . $data->{language};
  if (!$data->{tags}) {
  $data->{tags} = "ZZZZZZZZZZZZZZZZZZ";
  } else {
  $data->{tags} =~ s/\<[A-Za-z0-9 \%\/\?\-\&\=\"]+\>//ig;
  }