Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package HTML::LinkExtor; |
2 | ||
3 | =head1 NAME | |
4 | ||
5 | HTML::LinkExtor - Extract links from an HTML document | |
6 | ||
7 | =head1 SYNOPSIS | |
8 | ||
9 | require HTML::LinkExtor; | |
10 | $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); | |
11 | sub cb { | |
12 | my($tag, %links) = @_; | |
13 | print "$tag @{[%links]}\n"; | |
14 | } | |
15 | $p->parse_file("index.html"); | |
16 | ||
17 | =head1 DESCRIPTION | |
18 | ||
19 | I<HTML::LinkExtor> is an HTML parser that extracts links from an | |
20 | HTML document. The I<HTML::LinkExtor> is a subclass of | |
21 | I<HTML::Parser>. This means that the document should be given to the | |
22 | parser by calling the $p->parse() or $p->parse_file() methods. | |
23 | ||
24 | =cut | |
25 | ||
use strict;

require HTML::Parser;
use HTML::Tagset ();

# Inherit the parsing machinery; documents are fed to us through the
# inherited parse()/parse_file() methods.
our @ISA     = qw(HTML::Parser);
our $VERSION = sprintf("%d.%02d", q$Revision: 1.31 $ =~ /(\d+)\.(\d+)/);

# Legacy alias (some applications grab this hash directly): expose
# HTML::Tagset's tag-to-link-attribute map under our old package name.
our %LINK_ELEMENT;
*LINK_ELEMENT = \%HTML::Tagset::linkElements;
36 | ||
37 | =over 4 | |
38 | ||
39 | =item $p = HTML::LinkExtor->new([$callback[, $base]]) | |
40 | ||
41 | The constructor takes two optional arguments. The first is a reference | |
42 | to a callback routine. It will be called as links are found. If a | |
43 | callback is not provided, then links are just accumulated internally | |
44 | and can be retrieved by calling the $p->links() method. | |
45 | ||
46 | The $base argument is an optional base URL used to absolutize all URLs found. | |
47 | You need to have the I<URI> module installed if you provide $base. | |
48 | ||
49 | The callback is called with the lowercase tag name as first argument, | |
50 | and then all link attributes as separate key/value pairs. All | |
51 | non-link attributes are removed. | |
52 | ||
53 | =cut | |
54 | ||
# Construct a link-extracting parser.
#
# Arguments (both optional):
#   $callback - code ref invoked as ($tagname, %links) for each tag that
#               carries link attributes; if omitted, links accumulate
#               internally and are fetched with $p->links().
#   $base_url - base URL used to absolutize extracted URLs; loading the
#               URI module is deferred until a base is actually given.
sub new
{
    my ($class, $callback, $base_url) = @_;

    # Only tags known to carry link attributes need to be reported at all.
    my @interesting_tags = keys %HTML::Tagset::linkElements;
    my $self = $class->SUPER::new(
        start_h     => [ "_start_tag", "self,tagname,attr" ],
        report_tags => \@interesting_tags,
    );

    $self->{extractlink_cb} = $callback;

    if ($base_url) {
        require URI;
        $self->{extractlink_base} = URI->new($base_url);
    }

    return $self;
}
69 | ||
# Start-tag handler installed by new().  Looks up which attributes of
# $tag may hold URLs, collects the ones actually present (absolutized
# against the optional base), and forwards them to _found_link().
sub _start_tag
{
    my($self, $tag, $attr) = @_;

    my $base = $self->{extractlink_base};

    # %HTML::Tagset::linkElements maps a tag name to either one attribute
    # name or an array ref of attribute names; normalize to an array ref
    # so both shapes are handled uniformly.
    my $link_attrs = $HTML::Tagset::linkElements{$tag};
    $link_attrs = [$link_attrs] unless ref $link_attrs;

    my @links;
    # The loop variable is deliberately not called $a: a lexical $a would
    # shadow the package global used by sort() (the original code did
    # `my $a` here, a well-known Perl pitfall).
    for my $name (@$link_attrs) {
        next unless exists $attr->{$name};
        push(@links, $name,
             $base ? URI->new($attr->{$name}, $base)->abs($base)
                   : $attr->{$name});
    }
    return unless @links;
    $self->_found_link($tag, @links);
}
88 | ||
# Dispatch one discovered link (a tag name followed by attribute/URL
# pairs): hand it to the user-supplied callback when one was registered
# at construction time, otherwise queue it for later retrieval via
# $p->links().
sub _found_link
{
    my $self = shift;
    if (my $cb = $self->{extractlink_cb}) {
        $cb->(@_);
    }
    else {
        push @{ $self->{'links'} }, [@_];
    }
}
99 | ||
100 | =item $p->links | |
101 | ||
102 | Returns a list of all links found in the document. The returned | |
103 | values will be anonymous arrays with the following elements: | |
104 | ||
105 | [$tag, $attr => $url1, $attr2 => $url2,...] | |
106 | ||
107 | The $p->links method will also truncate the internal link list. This | |
108 | means that if the method is called twice without any parsing | |
109 | between them the second call will return an empty list. | |
110 | ||
111 | Also note that $p->links will always be empty if a callback routine | |
112 | was provided when the I<HTML::LinkExtor> was created. | |
113 | ||
114 | =cut | |
115 | ||
# Return every link accumulated so far as a list of array refs of the
# form [$tag, $attr1 => $url1, ...].  The internal list is truncated on
# each call, so a second call without intervening parsing returns an
# empty list.  Always empty when a callback was supplied to new().
sub links
{
    my $self = shift;
    return () unless exists $self->{'links'};
    my $accumulated = delete $self->{'links'};
    return @$accumulated;
}
121 | ||
122 | # We override the parse_file() method so that we can clear the links | |
123 | # before we start a new file. | |
124 | sub parse_file | |
125 | { | |
126 | my $self = shift; | |
127 | delete $self->{'links'}; | |
128 | $self->SUPER::parse_file(@_); | |
129 | } | |
130 | ||
131 | =back | |
132 | ||
133 | =head1 EXAMPLE | |
134 | ||
135 | This is an example showing how you can extract links from a document | |
136 | received using LWP: | |
137 | ||
138 | use LWP::UserAgent; | |
139 | use HTML::LinkExtor; | |
140 | use URI::URL; | |
141 | ||
142 | $url = "http://www.perl.org/"; # for instance | |
143 | $ua = LWP::UserAgent->new; | |
144 | ||
145 | # Set up a callback that collect image links | |
146 | my @imgs = (); | |
147 | sub callback { | |
148 | my($tag, %attr) = @_; | |
149 | return if $tag ne 'img'; # we only look closer at <img ...> | |
150 | push(@imgs, values %attr); | |
151 | } | |
152 | ||
153 | # Make the parser. Unfortunately, we don't know the base yet | |
154 | # (it might be different from $url) | |
155 | $p = HTML::LinkExtor->new(\&callback); | |
156 | ||
157 | # Request document and parse it as it arrives | |
158 | $res = $ua->request(HTTP::Request->new(GET => $url), | |
159 | sub {$p->parse($_[0])}); | |
160 | ||
161 | # Expand all image URLs to absolute ones | |
162 | my $base = $res->base; | |
163 | @imgs = map { $_ = url($_, $base)->abs; } @imgs; | |
164 | ||
165 | # Print them out | |
166 | print join("\n", @imgs), "\n"; | |
167 | ||
168 | =head1 SEE ALSO | |
169 | ||
170 | L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> | |
171 | ||
172 | =head1 COPYRIGHT | |
173 | ||
174 | Copyright 1996-2001 Gisle Aas. | |
175 | ||
176 | This library is free software; you can redistribute it and/or | |
177 | modify it under the same terms as Perl itself. | |
178 | ||
179 | =cut | |
180 | ||
181 | 1; |