superopac @Wiki

Plagger RSS 1.0 モジュール

最終更新：2006年10月12日 14:04

匿名ユーザー

- view

だれでも歓迎！編集

Plagger で RSS1.0 を出力するモジュール

　Plaggerで新着図書一覧ページを取得し、dcモジュール付きのRSS1.0として出力してみたかったので、作ってみました。
　テスト環境が、ActivePerl＋XPなので、それ以外の環境ではディレクトリの指定など適宜修正してください。
　コーディングとしては手抜きの箇所が多々ありますが、一応動くので、載せておきます。

作成・修正が必要なモジュール

Plagger::Plugin::Publish::Feed2;
Plagger::Plugin::Filter::EntryFullText;
Plagger::Entry;
Plagger::Plugin::SmartFeed;

yamlの例

config.yamlの例
/Plagger/assets/plugins/Filter-EntryFullTextのhit-u.yamlの例

モジュールの修正箇所

Plagger::Plugin::Publish::Feed2;

# Publish plugin that saves a Plagger feed to a file, either as RSS 1.0
# (through XML::RSS) or in another format (through XML::Feed).
package Plagger::Plugin::Publish::Feed2;

use strict;
use base qw( Plagger::Plugin );

our $VERSION = 0.01;

# XML::Feed / XML::Feed::Entry build the non-RSS (Atom by default) output.
use XML::Feed;
use XML::Feed::Entry;
use XML::RSS::LibXML;
# File::Spec joins the output directory and the generated file name.
use File::Spec;
# XML::RSS builds the RSS 1.0 output.
use XML::RSS;

# Select XML::RSS::LibXML as the parser class used by XML::Feed's RSS backend.
$XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";

# Plugin entry point: subscribes publish_feed to the 'publish.feed' hook and
# then lets init_feed prepare the output settings.
#
# Arguments: ($self, $context) - the plugin instance and the Plagger context.
sub register {
    my $self    = shift;
    my $context = shift;

    my %hooks = ('publish.feed' => \&publish_feed);
    $context->register_hook($self, %hooks);

    $self->init_feed($context);
}

# Prepares the plugin configuration before any feed is published.
#
# - Creates the directory named by the "dir" setting (mode 0755) when it is
#   not already an existing directory; a failed mkdir is reported through
#   $context->error.
# - Sets "full_content" to 1 when the key is absent from the configuration.
#
# Arguments: ($self, $context).
sub init_feed {
    my($self, $context) = @_;

    # check dir
    my $dir = $self->conf->{dir};
    if (!-d $dir) {
        mkdir $dir, 0755 or $context->error("mkdir $dir: $!");
    }

    $self->conf->{full_content} = 1
        unless exists $self->conf->{full_content};
}

# Hook handler for 'publish.feed': serializes the feed in $args->{feed} and
# saves it into the configured output directory.
#
# Config keys read here:
#   format  - 'RSS' writes RSS 1.0 through XML::RSS; any other value is
#             handed to XML::Feed.  Defaults to 'Atom'.
#   dir     - output directory (the file name comes from gen_filename).
#
# Arguments: ($self, $context, $args); $args->{feed} is the feed to publish.
# A failure to open or close the output file is reported via $context->error.
sub publish_feed {
    my($self, $context, $args) = @_;

    my $f           = $args->{feed};
    my $feed_format = $self->conf->{format} || 'Atom';

    # Only the document that is actually going to be written gets built.
    my $xml = $feed_format eq 'RSS'
        ? $self->_rss10_xml($f)
        : $self->_xml_feed_xml($f, $feed_format);

    # generate file path
    my $filepath = File::Spec->catfile($self->conf->{dir}, $self->gen_filename($f));

    $context->log(info => "save feed for " . $f->link . " to $filepath ($feed_format)");

    # The document is handled as a character string from here on, so it must
    # be written through a :utf8 layer whatever the format; printing it to a
    # raw handle mis-encodes text and triggers "Wide character" warnings.
    utf8::decode($xml) unless utf8::is_utf8($xml);
    open my $output, ">:utf8", $filepath or $context->error("$filepath: $!");
    print $output $xml;
    # Buffered write errors only surface when the handle is closed.
    close $output or $context->error("$filepath: $!");
}

# Builds the RSS 1.0 document (with dc, syn and taxo modules) for feed $f
# using XML::RSS and returns it as a string.
#
# Optional config keys dc_subject, dc_rights and dc_language override the
# channel's dc:subject, dc:rights and dc:language.
sub _rss10_xml {
    my($self, $f) = @_;
    my $conf = $self->conf;

    # Defaults are the values this plugin previously hard-coded.
    # NOTE(review): they look like sample data rather than values meant for
    # a real feed - set the dc_* config keys to replace them.
    my %dc_default = (
        subject  => "Linux Software",
        rights   => 'Copyright 1999, Freshmeat.net',
        language => 'en-us',
    );
    my %dc = map {
        my $key = "dc_$_";
        ($_ => defined $conf->{$key} ? $conf->{$key} : $dc_default{$_});
    } keys %dc_default;

    my $rss = XML::RSS->new(version => '1.0');
    $rss->channel(
        title       => $f->title,
        link        => $f->link,
        description => $f->description,
        dc => {
            date      => Plagger::Date->now,
            subject   => $dc{subject},
            creator   => $f->author,
            publisher => "Plagger/$Plagger::VERSION",
            rights    => $dc{rights},
            language  => $dc{language},
        },
        syn => {
            updatePeriod    => "hourly",
            updateFrequency => "1",
            updateBase      => "1901-01-01T00:00+00:00",
        },
        taxo => [
            'http://dmoz.org/Computers/Internet',
            'http://dmoz.org/Computers/PC'
        ],
    );

    # create RSS 1.0 entries
    for my $e ($f->entries) {
        $rss->add_item(
            title       => $e->title,
            link        => $e->permalink,
            description => $e->body_text,
            dc => {
                publisher => $e->publisher,
                subject   => $e->tags,
                creator   => $e->author,
                language  => $e->language,
                date      => $e->date,
            },
        );
    }

    return $rss->as_string;
}

# Builds the feed for $f with XML::Feed in $feed_format (any format other
# than 'RSS'; 'Atom' by default) and returns the serialized XML.
sub _xml_feed_xml {
    my($self, $f, $feed_format) = @_;

    # Stays in effect until this sub returns, i.e. also during as_xml.
    local $XML::Atom::DefaultVersion = "1.0";

    my $feed = XML::Feed->new($feed_format);
    $feed->title($f->title);
    $feed->link($f->link);
    $feed->modified(Plagger::Date->now);
    $feed->generator("Plagger/$Plagger::VERSION");
    $feed->description($f->description || '');
    $feed->author($f->author) if $f->primary_author;

    if ($feed_format eq 'Atom') {
        $feed->{atom}->id("tag:plagger.org,2006:" . $f->id);
    }

    # add entries
    for my $e ($f->entries) {
        my $entry = XML::Feed::Entry->new($feed_format);
        $entry->title($e->title);
        $entry->link($e->permalink);
        $entry->summary($e->body_text) if defined $e->body;

        # hack to bypass XML::Feed Atom 0.3 crufts (type="text/html"):
        # the content is set on the wrapped entry object directly.
        if ($self->conf->{full_content} && defined $e->body) {
            $entry->{entry}->content($e->body);
        }

        $entry->category($e->tags) if $e->tags;
        $entry->issued($e->date)   if $e->date;
        $entry->modified($e->date) if $e->date;

        $entry->id("tag:plagger.org,2006:" . $e->id);

        # shortcut kept for reference (disabled):
        # $entry->category(join(' ', @{$e->tags}));

        if ($e->has_enclosure) {
            # Inline enclosures and enclosures without a URL are skipped.
            for my $enclosure (grep { defined $_->url && !$_->is_inline } $e->enclosures) {
                $entry->add_enclosure({
                    url    => $enclosure->url,
                    length => $enclosure->length,
                    type   => $enclosure->type,
                });
            }
        }

        $feed->add_entry($entry);
    }

    return $feed->as_xml;
}

# Removes a leading http:// or https:// from a string.
my $strip_scheme = sub {
    my $s = shift;
    $s =~ s!^https?://!!;
    $s;
};

# Expanders for the %-codes accepted in the "filename" setting. Each one
# receives the feed object:
#   %u  feed url without the scheme     %l  feed link without the scheme
#   %t  feed title                      %i  feed id
my %formats = (
    u => sub { $strip_scheme->(scalar $_[0]->url) },
    l => sub { $strip_scheme->(scalar $_[0]->link) },
    t => sub { $_[0]->title },
    i => sub { $_[0]->id },
);

# Matches one %-code and captures its letter.
my $format_re = qr/%([ulti])/;

# Returns the output file name for $feed.
#
# The "filename" setting is used as a template; when it is not set the
# template is '%i.rss' for format 'RSS' and '%i.atom' otherwise. Each %-code
# matched by $format_re is replaced with the corresponding feed value from
# %formats, passed through safe_filename.
sub gen_filename {
    my($self, $feed) = @_;

    my $template = $self->conf->{filename};
    unless ($template) {
        my $ext = $self->conf->{format} eq 'RSS' ? 'rss' : 'atom';
        $template = '%i.' . $ext;
    }

    $template =~ s/$format_re/$self->safe_filename($formats{$1}->($feed))/eg;
    return $template;
}

# Makes $path usable as a file name: every run of characters that are
# neither word characters nor whitespace becomes a single "_", and then
# every run of whitespace becomes a single "_". Returns the result.
sub safe_filename {
    my($self, $path) = @_;

    for ($path) {
        s{[^\w\s]+}{_}g;
        s{\s+}{_}g;
    }

    return $path;
}

# XXX okay, this is a hack until XML::Feed is updated
#
# Installs add_enclosure on XML::Feed::Entry::Atom. The method takes a hash
# reference with "url", "type" and "length" and attaches it to the wrapped
# Atom entry as a link with rel="enclosure".
*XML::Feed::Entry::Atom::add_enclosure = sub {
    my $entry = shift;
    my $spec  = shift;

    my $atom_link = XML::Atom::Link->new;
    $atom_link->rel('enclosure');
    $atom_link->type($spec->{type});
    $atom_link->href($spec->{url});
    $atom_link->length($spec->{length});

    $entry->{entry}->add_link($atom_link);
};

# Installs add_enclosure on XML::Feed::Entry::RSS. The method stores a fresh
# hash holding the "url", "type" and "length" of the given enclosure in the
# wrapped entry's "enclosure" slot, replacing any previous value.
*XML::Feed::Entry::RSS::add_enclosure = sub {
    my $entry = shift;
    my $spec  = shift;

    my %attr = map { $_ => $spec->{$_} } qw( url type length );
    $entry->{entry}->{enclosure} = \%attr;
};


1;

__END__

Plagger::Plugin::Filter::EntryFullText;

14行目付近への追加

use Plagger::Tag;

149行目付近への追加

   # Copy the extra values captured for this page ($data) onto the entry.
   # Each field is only set when the captured value is true.
   # The tag string goes through Plagger::Tag->parse first
   # (presumably splitting it into individual tags - confirm in Plagger::Tag).
   $args->{entry}->tags(Plagger::Tag->parse($data->{tags})) if $data->{tags};
   # $args->{entry}->tags($data->{tags}) if $data->{tags};
   $args->{entry}->author($data->{author}) if $data->{author};
   $args->{entry}->publisher($data->{publisher}) if $data->{publisher};
   $args->{entry}->language($data->{language}) if $data->{language};

Plagger::Entry;

　5行目付近

# Accessor list for Plagger::Entry. "publisher" and "language" are presumably
# the fields added for the RSS 1.0 dc output - compare with the stock list.
__PACKAGE__->mk_accessors(qw( title author tags link feed_link id summary body rate icon meta source publisher language));

　103行目付近(1;の手前)

# Returns the entry's current publisher value; despite the name, nothing is
# added or modified.
sub add_publisher {
    my ($self) = @_;
    return $self->publisher;
}

# Returns the entry's current language value; despite the name, nothing is
# added or modified.
#
# The accessor is "language"; the previous spelling "langage" named a method
# that does not exist and made every call die.
sub add_language {
    my $self = shift;
    return $self->language;
}
# Returns the entry's current tags value; despite the name, nothing is added
# or modified.
sub add_tags {
    my ($self) = @_;
    return $self->tags;
}

Plagger::Plugin::SmartFeed;

   # Use the configured description/author when set; otherwise fall back to
   # the feed title for both.
   $feed->description( $self->conf->{description} || $feed->title );
   $feed->author( $self->conf->{author} || $feed->title );

config.yamlの例

global:
 plugin_path:
  - D:\Perl\site\lib\Plagger\Plugin
 assets_path: D:\Perl\site\lib\Plagger\assets
 timezone: Asia/Tokyo
 log:
  level: debug
 cache:
  base: c:\plagger

# Looks for a *.yaml that contains the URL given under feed, then fetches and filters the data.
# When you change the url, update the EntryFullText (EFT) yaml as well.
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/new.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book2/new.html
          meta:
            follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
# Apparently ? and & have to be escaped.
# http://shakenbu.org/yanagi/d/20060923.html 
# When EntryFullText specifies custom_feed_handle together with custom_feed_follow_link / custom_feed_follow_xpath, CustomFeed::Simple is not needed.
#  - module: CustomFeed::Config
#  - module: CustomFeed::Simple
# Removes duplicate data
#  - module: Filter::Rule
#    rule:
#      module: Deduped
#    path: C:\plagger\dedupe-hit-u.db
  - module: SmartFeed
    rule:
      module: Fresh
      mtime:
        path: C:\plagger\tmp\foo.tmp
        autoupdate: 1
    config:
      title: 一橋大学附属図書館新着図書
      link: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
      description: 一橋大学附属図書館の新着図書リストからRSS1.0を生成します。
      author: 自分のお名前

# Perhaps EFT and CustomFeed::Config cannot be used together?
# Whether to put the HTML into <content:encoded> when nothing matched
  - module: Filter::EntryFullText
    config:
      store_html_on_failure: 0
# Strips HTML tags from the fetched file
  - module: Filter::HTMLScrubber
  - module: Publish::Feed2
    config:
      format: RSS
      dir: c:\plagger
      filename: rss-hit3.xml

/Plagger/assets/plugins/Filter-EntryFullText のhit-u.yaml

# Filter::EntryFullText site definition for the hit-u OPAC new-arrivals list.
# NOTE(review): the key descriptions below are inferred from the key names
# and the patterns - confirm against the EntryFullText documentation.
author: wono
# The list page that is treated as a feed.
custom_feed_handle: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html

# Links on the list page matching this pattern become entries.
custom_feed_follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
# Entry pages this definition applies to.
handle: http://opac\.lib\.hit-u\.ac\.jp/opac/books-query\?mode=2\&place=\&code=\d+
# Five capture groups, named in order by extract_capture.
extract: <H1 CLASS="TR">(.*?)</H1>.*?<SPAN CLASS="PUBL" ID="VALUE">(.*?)</SPAN>.+?<SPAN CLASS="AL" ID="VALUE"><A HREF="/opac/disp-query\?mode=2\&con1=3\&kywd1=[\%a-zA-Z0-9]+\&con2=3\&con3=4\&disp=1">(.*?)</A>.*?<SPAN CLASS="CLS" ID="VALUE">(.*?)</A>.*?<SPAN CLASS="TXTL" ID="VALUE">(.*?)</SPAN>

extract_capture: title publisher author tags language
# Perl hook: an empty publisher becomes "test"; the body is built from
# publisher, author and language; missing tags get a placeholder value,
# otherwise markup is stripped from the tags string.
extract_after_hook: |
  if ($data->{publisher} eq "") {$data->{publisher} = "test"};
  $data->{body} = $data->{publisher} . " author:" . $data->{author} . " lang:" . $data->{language};
  if (!$data->{tags}) {
  $data->{tags} = "ZZZZZZZZZZZZZZZZZZ";
  } else {
  $data->{tags} =~ s/\<[A-Za-z0-9 \%\/\?\-\&\=\"]+\>//ig;
  }

タグ：

plagger

+ タグ編集

「Plagger RSS 1.0 モジュール」をウィキ内検索

最近更新されたページ

人気Wikiランキング

atwikiでよく見られているWikiのランキングです。新しい情報を発見してみよう！

全体ページランキング

最近アクセスの多かったページランキングです。話題のページを見に行こう！

Plagger RSS 1.0 モジュール

Plagger で RSS1.0 を出力するモジュール

作成・修正が必要なモジュール

yamlの例

モジュールの修正箇所

更新履歴