「Plagger RSS 1.0 モジュール」の編集履歴(バックアップ)一覧はこちら
「Plagger RSS 1.0 モジュール」(2006/10/12 (木) 14:04:09) の最新版変更点
追加された行は緑色になります。
削除された行は赤色になります。
*Plagger で RSS1.0 を出力するモジュール
plaggerで新着図書一覧ページから、RSS1.0でdcモジュールを使ってみたかったので、作ってみました。
テスト環境が、ActivePerl+XPなので、それ以外の環境ではディレクトリの指定など適宜修正してください。
**作成・修正が必要なモジュール
- Plagger::Plugin::Publish::Feed2;
- Plagger::Plugin::Filter::EntryFullText;
- Plagger::Entry;
- Plagger::Plugin::SmartFeed;
**yamlの例
- config.yamlの例
- /Plagger/assets/plugins/Filter-EntryFullTextのhit-u.yamlの例
**モジュールの修正箇所
- Plagger::Plugin::Publish::Feed2;
package Plagger::Plugin::Publish::Feed2;
use strict;
use base qw( Plagger::Plugin );
our $VERSION = 0.01;
use XML::Feed;
use XML::Feed::Entry;
use XML::RSS::LibXML;
use File::Spec;
use XML::RSS;
$XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";
# Plugin entry point: subscribes publish_feed to the 'publish.feed' hook and
# then runs init_feed with the same context.
sub register {
    my ($plugin, $ctx) = @_;

    my %hooks = ('publish.feed' => \&publish_feed);
    $ctx->register_hook($plugin, %hooks);

    $plugin->init_feed($ctx);
}
# Prepares the plugin configuration before any feed is published:
#   * creates conf->{dir} (mode 0755) when it is not an existing directory,
#     reporting a failed mkdir through $context->error;
#   * sets conf->{full_content} to 1 when that key is absent.
sub init_feed {
    my ($plugin, $ctx) = @_;
    my $settings = $plugin->conf;

    # make sure the output directory is there
    my $out_dir = $settings->{dir};
    if (!-d $out_dir) {
        mkdir($out_dir, 0755) or $ctx->error("mkdir $out_dir: $!");
    }

    $settings->{full_content} = 1 unless exists $settings->{full_content};
}
# Hook handler for 'publish.feed': serializes the feed in $args->{feed} and
# writes it to a file under conf->{dir}.
#
# Parameters:
#   $self    - this plugin; conf keys read here are format, dir and
#              full_content (gen_filename reads filename and format)
#   $context - used for log() and error()
#   $args    - hook arguments; $args->{feed} is the feed to publish
#
# When conf->{format} is 'RSS' the file holds RSS 1.0 produced by XML::RSS
# (with dc, syn and taxo channel data); any other value is handed to XML::Feed,
# with 'Atom' as the default.
sub publish_feed {
    my($self, $context, $args) = @_;

    my $conf        = $self->conf;
    my $f           = $args->{feed};
    my $feed_format = $conf->{format} || 'Atom';
    my $is_rss      = $feed_format eq 'RSS';

    local $XML::Atom::DefaultVersion = "1.0";

    # generate ATOM feed
    my $feed = XML::Feed->new($feed_format);
    $feed->title($f->title);
    $feed->link($f->link);
    $feed->modified(Plagger::Date->now);
    $feed->generator("Plagger/$Plagger::VERSION");
    $feed->description($f->description || '');
    $feed->author($f->author) if $f->primary_author;

    if ($feed_format eq 'Atom') {
        $feed->{atom}->id("tag:plagger.org,2006:" . $f->id);
    }

    # generate RSS 1.0
    # Every method call inside the hash lists below is forced to scalar context
    # so that a method returning an empty list cannot shift the key/value pairs.
    # NOTE(review): dc subject/rights/language, syn and taxo are fixed literals
    # that are not derived from the feed; they look like sample values --
    # confirm whether they should come from the configuration instead.
    my $rss = XML::RSS->new(version => '1.0');
    $rss->channel(
        title       => scalar($f->title),
        link        => scalar($f->link),
        description => scalar($f->description),
        dc => {
            date      => scalar(Plagger::Date->now),
            subject   => "Linux Software",
            creator   => scalar($f->author),
            publisher => "Plagger/$Plagger::VERSION",
            rights    => 'Copyright 1999, Freshmeat.net',
            language  => 'en-us',
        },
        syn => {
            updatePeriod    => "hourly",
            updateFrequency => "1",
            updateBase      => "1901-01-01T00:00+00:00",
        },
        taxo => [
            'http://dmoz.org/Computers/Internet',
            'http://dmoz.org/Computers/PC'
        ]
    );

    # create entries
    for my $e ($f->entries) {
        # create RSS 1.0 entries
        if ($is_rss) {
            $rss->add_item(
                title       => scalar($e->title),
                link        => scalar($e->permalink),
                description => scalar($e->body_text),
                dc => {
                    publisher => scalar($e->publisher),
                    subject   => scalar($e->tags),
                    creator   => scalar($e->author),
                    language  => scalar($e->language),
                    date      => scalar($e->date),
                }
            );
            next;
        }

        # create Atom entries (this path is never reached for 'RSS', so the
        # former RSS-only checks inside it were dead code and are gone)
        my $entry = XML::Feed::Entry->new($feed_format);
        $entry->title($e->title);
        $entry->link($e->permalink);
        $entry->summary($e->body_text) if defined $e->body;

        # hack to bypass XML::Feed Atom 0.3 crufts (type="text/html")
        if ($conf->{full_content} && defined $e->body) {
            $entry->{entry}->content($e->body);
        }

        $entry->category($e->tags) if $e->tags;
        $entry->issued($e->date)   if $e->date;
        $entry->modified($e->date) if $e->date;

        $entry->id("tag:plagger.org,2006:" . $e->id);

        # tenuki
        # $entry->category(join(' ', @{$e->tags}));

        if ($e->has_enclosure) {
            for my $enclosure (grep { defined $_->url && !$_->is_inline } $e->enclosures) {
                $entry->add_enclosure({
                    url    => scalar($enclosure->url),
                    length => scalar($enclosure->length),
                    type   => scalar($enclosure->type),
                });
            }
        }

        $feed->add_entry($entry);
    }

    # generate file path
    my $filepath = File::Spec->catfile($conf->{dir}, $self->gen_filename($f));
    $context->log(info => "save feed for " . $f->link . " to $filepath ($feed_format)");

    # Serialize whichever document matches the format. Both formats are
    # written through the same :utf8 layer: the string is character data after
    # utf8::decode, so a plain ">" handle would emit "Wide character" warnings
    # or Latin-1 bytes.
    my $xml = $is_rss ? $rss->as_string : $feed->as_xml;
    utf8::decode($xml) unless utf8::is_utf8($xml);

    open my $output, ">:utf8", $filepath or $context->error("$filepath: $!");
    print $output $xml;
    # buffered write errors only show up at close time
    close $output or $context->error("close $filepath: $!");
}
# Placeholder letter => code ref that extracts the matching value from a feed.
# %u and %l drop a leading http:// or https:// from the feed's url / link.
my %formats = (
    'u' => sub {
        (my $bare = $_[0]->url) =~ s!^https?://!!;
        $bare;
    },
    'l' => sub {
        (my $bare = $_[0]->link) =~ s!^https?://!!;
        $bare;
    },
    't' => sub { my ($feed) = @_; return $feed->title },
    'i' => sub { my ($feed) = @_; return $feed->id },
);

# Matches one placeholder and captures its letter in $1.
my $format_re = qr/%([ulti])/;
# Builds the output file name for $feed.
# conf->{filename} is used as the template when set; otherwise the template is
# '%i.rss' for the 'RSS' format and '%i.atom' for anything else. Each %u, %l,
# %t or %i placeholder is replaced by the matching %formats value passed
# through safe_filename. Returns the resulting name.
sub gen_filename {
    my($self, $feed) = @_;

    # publish_feed treats a missing format as 'Atom'; apply the same default
    # here so an unset conf->{format} is never compared as undef.
    my $format = $self->conf->{format} || 'Atom';

    my $file = $self->conf->{filename}
        || '%i.' . ($format eq 'RSS' ? 'rss' : 'atom');

    $file =~ s{$format_re}{
        $self->safe_filename($formats{$1}->($feed))
    }egx;

    return $file;
}
# Returns $path with every run of characters that are neither word characters
# nor whitespace replaced by one underscore, and every run of whitespace
# replaced by one underscore.
sub safe_filename {
    my ($self, $path) = @_;

    # The two kinds of runs never overlap, so a single alternation gives the
    # same result as replacing them one after the other.
    $path =~ s{(?:[^\w\s]+|\s+)}{_}g;

    return $path;
}
# XXX stop-gap until XML::Feed is updated: installs add_enclosure on
# XML::Feed::Entry::Atom. It builds an XML::Atom::Link with rel="enclosure",
# fills type, href and length from the enclosure hash ref, and adds the link
# to the wrapped entry.
*XML::Feed::Entry::Atom::add_enclosure = sub {
    my ($atom_entry, $spec) = @_;

    my $enc_link = XML::Atom::Link->new;
    $enc_link->rel('enclosure');

    # [ link setter, key in the enclosure hash ]
    for my $pair ([type => 'type'], [href => 'url'], [length => 'length']) {
        my ($setter, $key) = @{$pair};
        $enc_link->$setter($spec->{$key});
    }

    $atom_entry->{entry}->add_link($enc_link);
};
# Installs add_enclosure on XML::Feed::Entry::RSS: stores a fresh hash with the
# url, type and length of the given enclosure under {entry}{enclosure},
# replacing whatever was stored there before.
*XML::Feed::Entry::RSS::add_enclosure = sub {
    my ($rss_entry, $spec) = @_;

    my %attrs;
    @attrs{qw(url type length)} = @{$spec}{qw(url type length)};

    $rss_entry->{entry}->{enclosure} = \%attrs;
};
1;
__END__
- Plagger::Plugin::Filter::EntryFullText;
14行目付近への追加
use Plagger::Tag;
149行目付近への追加
# Copy the extra captured fields from $data onto the entry. Each assignment is
# skipped when the captured value is false.
# The tag string is run through Plagger::Tag->parse before being stored.
$args->{entry}->tags(Plagger::Tag->parse($data->{tags})) if $data->{tags};
# Earlier variant that stored the raw tag string without parsing:
# $args->{entry}->tags($data->{tags}) if $data->{tags};
$args->{entry}->author($data->{author}) if $data->{author};
$args->{entry}->publisher($data->{publisher}) if $data->{publisher};
$args->{entry}->language($data->{language}) if $data->{language};
- Plagger::Entry;
5行目付近
# Accessor list for the entry class. publisher and language are the accessors
# that Publish::Feed2 reads ($e->publisher, $e->language) for its dc elements.
__PACKAGE__->mk_accessors(qw( title author tags link feed_link id summary body rate icon meta source publisher language));
103行目付近(1;の手前)
# Returns the entry's current publisher value. Despite the name, nothing is
# added or modified here.
sub add_publisher {
    my ($entry) = @_;
    return $entry->publisher;
}
# Returns the entry's current language value. Despite the name, nothing is
# added or modified here.
sub add_language {
    my $self = shift;
    # Fixed: this used to call the misspelled method "langage", which is not
    # one of the accessors declared via mk_accessors (the accessor is
    # "language"), so every call failed with a missing-method error.
    return $self->language;
}
# Returns the entry's current tags value. Despite the name, nothing is added
# or modified here.
sub add_tags {
    my ($entry) = @_;
    return $entry->tags;
}
- Plagger::Plugin::SmartFeed;
# Take description and author from the plugin configuration; when either one
# is not set (or false), the feed title is used in its place.
$feed->description( $self->conf->{description} || $feed->title );
$feed->author( $self->conf->{author} || $feed->title );
- config.yamlの例
global:
  plugin_path:
    - D:\Perl\site\lib\Plagger\Plugin
  assets_path: D:\Perl\site\lib\Plagger\assets
  timezone: Asia/Tokyo
  log:
    level: debug
  cache:
    base: c:\plagger

# Plagger looks for the *.yaml that handles the URL given under feed, then
# fetches and filters the data.
# When the url is changed, update the EntryFullText (EFT) yaml as well.
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/new.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book2/new.html
          meta:
            follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
            # Apparently ? and & have to be escaped.
            # http://shakenbu.org/yanagi/d/20060923.html

  # When EntryFullText is given custom_feed_handle together with
  # custom_feed_follow_link / custom_feed_follow_xpath, CustomFeed::Simple is
  # not needed.
  # - module: CustomFeed::Config
  # - module: CustomFeed::Simple

  # Removes duplicate entries
  # - module: Filter::Rule
  #   rule:
  #     module: Deduped
  #     path: C:\plagger\dedupe-hit-u.db

  - module: SmartFeed
    rule:
      module: Fresh
      mtime:
        path: C:\plagger\tmp\foo.tmp
        autoupdate: 1
    config:
      title: 一橋大学附属図書館新着図書
      link: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
      description: 一橋大学附属図書館の新着図書リストからRSS1.0を生成します。
      author: 自分のお名前

  # Possibly EFT and CustomFeed::Config cannot be used together?
  # store_html_on_failure: whether to put the HTML into <content:encoded>
  # when nothing matched
  - module: Filter::EntryFullText
    config:
      store_html_on_failure: 0

  # Strips HTML tags from the fetched content
  - module: Filter::HTMLScrubber

  - module: Publish::Feed2
    config:
      format: RSS
      dir: c:\plagger
      filename: rss-hit3.xml
- /Plagger/assets/plugins/Filter-EntryFullText のhit-u.yaml
# EntryFullText definition for the opac.lib.hit-u.ac.jp new-arrivals page.
# The five captures of "extract" are assigned, in order, to the names listed in
# "extract_capture"; "extract_after_hook" is Perl code operating on $data.
author: wono
custom_feed_handle: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
custom_feed_follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
handle: http://opac\.lib\.hit-u\.ac\.jp/opac/books-query\?mode=2\&place=\&code=\d+
extract: <H1 CLASS="TR">(.*?)</H1>.*?<SPAN CLASS="PUBL" ID="VALUE">(.*?)</SPAN>.+?<SPAN CLASS="AL" ID="VALUE"><A HREF="/opac/disp-query\?mode=2\&con1=3\&kywd1=[\%a-zA-Z0-9]+\&con2=3\&con3=4\&disp=1">(.*?)</A>.*?<SPAN CLASS="CLS" ID="VALUE">(.*?)</A>.*?<SPAN CLASS="TXTL" ID="VALUE">(.*?)</SPAN>
extract_capture: title publisher author tags language
extract_after_hook: |
  if ($data->{publisher} eq "") {$data->{publisher} = "test"};
  $data->{body} = $data->{publisher} . " author:" . $data->{author} . " lang:" . $data->{language};
  if (!$data->{tags}) {
    $data->{tags} = "ZZZZZZZZZZZZZZZZZZ";
  } else {
    $data->{tags} =~ s/\<[A-Za-z0-9 \%\/\?\-\&\=\"]+\>//ig;
  }
*Plagger で RSS1.0 を出力するモジュール
plaggerで新着図書一覧ページから、RSS1.0でdcモジュールを使ってみたかったので、作ってみました。
テスト環境が、ActivePerl+XPなので、それ以外の環境ではディレクトリの指定など適宜修正してください。
コーディングとしては手抜きの箇所が多々ありますが、一応動くので、載せておきます。
**作成・修正が必要なモジュール
- Plagger::Plugin::Publish::Feed2;
- Plagger::Plugin::Filter::EntryFullText;
- Plagger::Entry;
- Plagger::Plugin::SmartFeed;
**yamlの例
- config.yamlの例
- /Plagger/assets/plugins/Filter-EntryFullTextのhit-u.yamlの例
**モジュールの修正箇所
- Plagger::Plugin::Publish::Feed2;
package Plagger::Plugin::Publish::Feed2;
use strict;
use base qw( Plagger::Plugin );
our $VERSION = 0.01;
use XML::Feed;
use XML::Feed::Entry;
use XML::RSS::LibXML;
use File::Spec;
use XML::RSS;
$XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";
# Plugin entry point: subscribes publish_feed to the 'publish.feed' hook and
# then runs init_feed with the same context.
sub register {
    my ($plugin, $ctx) = @_;

    my %hooks = ('publish.feed' => \&publish_feed);
    $ctx->register_hook($plugin, %hooks);

    $plugin->init_feed($ctx);
}
# Prepares the plugin configuration before any feed is published:
#   * creates conf->{dir} (mode 0755) when it is not an existing directory,
#     reporting a failed mkdir through $context->error;
#   * sets conf->{full_content} to 1 when that key is absent.
sub init_feed {
    my ($plugin, $ctx) = @_;
    my $settings = $plugin->conf;

    # make sure the output directory is there
    my $out_dir = $settings->{dir};
    if (!-d $out_dir) {
        mkdir($out_dir, 0755) or $ctx->error("mkdir $out_dir: $!");
    }

    $settings->{full_content} = 1 unless exists $settings->{full_content};
}
# Hook handler for 'publish.feed': serializes the feed in $args->{feed} and
# writes it to a file under conf->{dir}.
#
# Parameters:
#   $self    - this plugin; conf keys read here are format, dir and
#              full_content (gen_filename reads filename and format)
#   $context - used for log() and error()
#   $args    - hook arguments; $args->{feed} is the feed to publish
#
# When conf->{format} is 'RSS' the file holds RSS 1.0 produced by XML::RSS
# (with dc, syn and taxo channel data); any other value is handed to XML::Feed,
# with 'Atom' as the default.
sub publish_feed {
    my($self, $context, $args) = @_;

    my $conf        = $self->conf;
    my $f           = $args->{feed};
    my $feed_format = $conf->{format} || 'Atom';
    my $is_rss      = $feed_format eq 'RSS';

    local $XML::Atom::DefaultVersion = "1.0";

    # generate ATOM feed
    my $feed = XML::Feed->new($feed_format);
    $feed->title($f->title);
    $feed->link($f->link);
    $feed->modified(Plagger::Date->now);
    $feed->generator("Plagger/$Plagger::VERSION");
    $feed->description($f->description || '');
    $feed->author($f->author) if $f->primary_author;

    if ($feed_format eq 'Atom') {
        $feed->{atom}->id("tag:plagger.org,2006:" . $f->id);
    }

    # generate RSS 1.0
    # Every method call inside the hash lists below is forced to scalar context
    # so that a method returning an empty list cannot shift the key/value pairs.
    # NOTE(review): dc subject/rights/language, syn and taxo are fixed literals
    # that are not derived from the feed; they look like sample values --
    # confirm whether they should come from the configuration instead.
    my $rss = XML::RSS->new(version => '1.0');
    $rss->channel(
        title       => scalar($f->title),
        link        => scalar($f->link),
        description => scalar($f->description),
        dc => {
            date      => scalar(Plagger::Date->now),
            subject   => "Linux Software",
            creator   => scalar($f->author),
            publisher => "Plagger/$Plagger::VERSION",
            rights    => 'Copyright 1999, Freshmeat.net',
            language  => 'en-us',
        },
        syn => {
            updatePeriod    => "hourly",
            updateFrequency => "1",
            updateBase      => "1901-01-01T00:00+00:00",
        },
        taxo => [
            'http://dmoz.org/Computers/Internet',
            'http://dmoz.org/Computers/PC'
        ]
    );

    # create entries
    for my $e ($f->entries) {
        # create RSS 1.0 entries
        if ($is_rss) {
            $rss->add_item(
                title       => scalar($e->title),
                link        => scalar($e->permalink),
                description => scalar($e->body_text),
                dc => {
                    publisher => scalar($e->publisher),
                    subject   => scalar($e->tags),
                    creator   => scalar($e->author),
                    language  => scalar($e->language),
                    date      => scalar($e->date),
                }
            );
            next;
        }

        # create Atom entries (this path is never reached for 'RSS', so the
        # former RSS-only checks inside it were dead code and are gone)
        my $entry = XML::Feed::Entry->new($feed_format);
        $entry->title($e->title);
        $entry->link($e->permalink);
        $entry->summary($e->body_text) if defined $e->body;

        # hack to bypass XML::Feed Atom 0.3 crufts (type="text/html")
        if ($conf->{full_content} && defined $e->body) {
            $entry->{entry}->content($e->body);
        }

        $entry->category($e->tags) if $e->tags;
        $entry->issued($e->date)   if $e->date;
        $entry->modified($e->date) if $e->date;

        $entry->id("tag:plagger.org,2006:" . $e->id);

        # tenuki
        # $entry->category(join(' ', @{$e->tags}));

        if ($e->has_enclosure) {
            for my $enclosure (grep { defined $_->url && !$_->is_inline } $e->enclosures) {
                $entry->add_enclosure({
                    url    => scalar($enclosure->url),
                    length => scalar($enclosure->length),
                    type   => scalar($enclosure->type),
                });
            }
        }

        $feed->add_entry($entry);
    }

    # generate file path
    my $filepath = File::Spec->catfile($conf->{dir}, $self->gen_filename($f));
    $context->log(info => "save feed for " . $f->link . " to $filepath ($feed_format)");

    # Serialize whichever document matches the format. Both formats are
    # written through the same :utf8 layer: the string is character data after
    # utf8::decode, so a plain ">" handle would emit "Wide character" warnings
    # or Latin-1 bytes.
    my $xml = $is_rss ? $rss->as_string : $feed->as_xml;
    utf8::decode($xml) unless utf8::is_utf8($xml);

    open my $output, ">:utf8", $filepath or $context->error("$filepath: $!");
    print $output $xml;
    # buffered write errors only show up at close time
    close $output or $context->error("close $filepath: $!");
}
# Placeholder letter => code ref that extracts the matching value from a feed.
# %u and %l drop a leading http:// or https:// from the feed's url / link.
my %formats = (
    'u' => sub {
        (my $bare = $_[0]->url) =~ s!^https?://!!;
        $bare;
    },
    'l' => sub {
        (my $bare = $_[0]->link) =~ s!^https?://!!;
        $bare;
    },
    't' => sub { my ($feed) = @_; return $feed->title },
    'i' => sub { my ($feed) = @_; return $feed->id },
);

# Matches one placeholder and captures its letter in $1.
my $format_re = qr/%([ulti])/;
# Builds the output file name for $feed.
# conf->{filename} is used as the template when set; otherwise the template is
# '%i.rss' for the 'RSS' format and '%i.atom' for anything else. Each %u, %l,
# %t or %i placeholder is replaced by the matching %formats value passed
# through safe_filename. Returns the resulting name.
sub gen_filename {
    my($self, $feed) = @_;

    # publish_feed treats a missing format as 'Atom'; apply the same default
    # here so an unset conf->{format} is never compared as undef.
    my $format = $self->conf->{format} || 'Atom';

    my $file = $self->conf->{filename}
        || '%i.' . ($format eq 'RSS' ? 'rss' : 'atom');

    $file =~ s{$format_re}{
        $self->safe_filename($formats{$1}->($feed))
    }egx;

    return $file;
}
# Returns $path with every run of characters that are neither word characters
# nor whitespace replaced by one underscore, and every run of whitespace
# replaced by one underscore.
sub safe_filename {
    my ($self, $path) = @_;

    # The two kinds of runs never overlap, so a single alternation gives the
    # same result as replacing them one after the other.
    $path =~ s{(?:[^\w\s]+|\s+)}{_}g;

    return $path;
}
# XXX stop-gap until XML::Feed is updated: installs add_enclosure on
# XML::Feed::Entry::Atom. It builds an XML::Atom::Link with rel="enclosure",
# fills type, href and length from the enclosure hash ref, and adds the link
# to the wrapped entry.
*XML::Feed::Entry::Atom::add_enclosure = sub {
    my ($atom_entry, $spec) = @_;

    my $enc_link = XML::Atom::Link->new;
    $enc_link->rel('enclosure');

    # [ link setter, key in the enclosure hash ]
    for my $pair ([type => 'type'], [href => 'url'], [length => 'length']) {
        my ($setter, $key) = @{$pair};
        $enc_link->$setter($spec->{$key});
    }

    $atom_entry->{entry}->add_link($enc_link);
};
# Installs add_enclosure on XML::Feed::Entry::RSS: stores a fresh hash with the
# url, type and length of the given enclosure under {entry}{enclosure},
# replacing whatever was stored there before.
*XML::Feed::Entry::RSS::add_enclosure = sub {
    my ($rss_entry, $spec) = @_;

    my %attrs;
    @attrs{qw(url type length)} = @{$spec}{qw(url type length)};

    $rss_entry->{entry}->{enclosure} = \%attrs;
};
1;
__END__
- Plagger::Plugin::Filter::EntryFullText;
14行目付近への追加
use Plagger::Tag;
149行目付近への追加
# Copy the extra captured fields from $data onto the entry. Each assignment is
# skipped when the captured value is false.
# The tag string is run through Plagger::Tag->parse before being stored.
$args->{entry}->tags(Plagger::Tag->parse($data->{tags})) if $data->{tags};
# Earlier variant that stored the raw tag string without parsing:
# $args->{entry}->tags($data->{tags}) if $data->{tags};
$args->{entry}->author($data->{author}) if $data->{author};
$args->{entry}->publisher($data->{publisher}) if $data->{publisher};
$args->{entry}->language($data->{language}) if $data->{language};
- Plagger::Entry;
5行目付近
# Accessor list for the entry class. publisher and language are the accessors
# that Publish::Feed2 reads ($e->publisher, $e->language) for its dc elements.
__PACKAGE__->mk_accessors(qw( title author tags link feed_link id summary body rate icon meta source publisher language));
103行目付近(1;の手前)
# Returns the entry's current publisher value. Despite the name, nothing is
# added or modified here.
sub add_publisher {
    my ($entry) = @_;
    return $entry->publisher;
}
# Returns the entry's current language value. Despite the name, nothing is
# added or modified here.
sub add_language {
    my $self = shift;
    # Fixed: this used to call the misspelled method "langage", which is not
    # one of the accessors declared via mk_accessors (the accessor is
    # "language"), so every call failed with a missing-method error.
    return $self->language;
}
# Returns the entry's current tags value. Despite the name, nothing is added
# or modified here.
sub add_tags {
    my ($entry) = @_;
    return $entry->tags;
}
- Plagger::Plugin::SmartFeed;
# Take description and author from the plugin configuration; when either one
# is not set (or false), the feed title is used in its place.
$feed->description( $self->conf->{description} || $feed->title );
$feed->author( $self->conf->{author} || $feed->title );
- config.yamlの例
global:
  plugin_path:
    - D:\Perl\site\lib\Plagger\Plugin
  assets_path: D:\Perl\site\lib\Plagger\assets
  timezone: Asia/Tokyo
  log:
    level: debug
  cache:
    base: c:\plagger

# Plagger looks for the *.yaml that handles the URL given under feed, then
# fetches and filters the data.
# When the url is changed, update the EntryFullText (EFT) yaml as well.
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book1/new.html
        # - url: http://opac.lib.hit-u.ac.jp/opac-new/book2/new.html
          meta:
            follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
            # Apparently ? and & have to be escaped.
            # http://shakenbu.org/yanagi/d/20060923.html

  # When EntryFullText is given custom_feed_handle together with
  # custom_feed_follow_link / custom_feed_follow_xpath, CustomFeed::Simple is
  # not needed.
  # - module: CustomFeed::Config
  # - module: CustomFeed::Simple

  # Removes duplicate entries
  # - module: Filter::Rule
  #   rule:
  #     module: Deduped
  #     path: C:\plagger\dedupe-hit-u.db

  - module: SmartFeed
    rule:
      module: Fresh
      mtime:
        path: C:\plagger\tmp\foo.tmp
        autoupdate: 1
    config:
      title: 一橋大学附属図書館新着図書
      link: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
      description: 一橋大学附属図書館の新着図書リストからRSS1.0を生成します。
      author: 自分のお名前

  # Possibly EFT and CustomFeed::Config cannot be used together?
  # store_html_on_failure: whether to put the HTML into <content:encoded>
  # when nothing matched
  - module: Filter::EntryFullText
    config:
      store_html_on_failure: 0

  # Strips HTML tags from the fetched content
  - module: Filter::HTMLScrubber

  - module: Publish::Feed2
    config:
      format: RSS
      dir: c:\plagger
      filename: rss-hit3.xml
- /Plagger/assets/plugins/Filter-EntryFullText のhit-u.yaml
# EntryFullText definition for the opac.lib.hit-u.ac.jp new-arrivals page.
# The five captures of "extract" are assigned, in order, to the names listed in
# "extract_capture"; "extract_after_hook" is Perl code operating on $data.
author: wono
custom_feed_handle: http://opac.lib.hit-u.ac.jp/opac-new/book1/8.html
custom_feed_follow_link: /opac/books-query\?mode=2\&place=\&code=\d+
handle: http://opac\.lib\.hit-u\.ac\.jp/opac/books-query\?mode=2\&place=\&code=\d+
extract: <H1 CLASS="TR">(.*?)</H1>.*?<SPAN CLASS="PUBL" ID="VALUE">(.*?)</SPAN>.+?<SPAN CLASS="AL" ID="VALUE"><A HREF="/opac/disp-query\?mode=2\&con1=3\&kywd1=[\%a-zA-Z0-9]+\&con2=3\&con3=4\&disp=1">(.*?)</A>.*?<SPAN CLASS="CLS" ID="VALUE">(.*?)</A>.*?<SPAN CLASS="TXTL" ID="VALUE">(.*?)</SPAN>
extract_capture: title publisher author tags language
extract_after_hook: |
  if ($data->{publisher} eq "") {$data->{publisher} = "test"};
  $data->{body} = $data->{publisher} . " author:" . $data->{author} . " lang:" . $data->{language};
  if (!$data->{tags}) {
    $data->{tags} = "ZZZZZZZZZZZZZZZZZZ";
  } else {
    $data->{tags} =~ s/\<[A-Za-z0-9 \%\/\?\-\&\=\"]+\>//ig;
  }
表示オプション
横に並べて表示:
変化行の前後のみ表示: