""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line need not be preceded by one or more
        blank lines."""
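        # Illustrative robots.txt fragment (hypothetical, not taken from any
        # real site) and how the state machine below treats each line:
        #
        #     User-agent: ExampleBot    -> adds an agent to the entry, state 1
        #     Disallow: /cgi-bin/       -> RuleLine('/cgi-bin/', False), state 2
        #     Allow: /cgi-bin/public    -> RuleLine('/cgi-bin/public', True)
        #     (blank line)              -> the entry is stored via _add_entry()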
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                                                                 line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state==2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
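        # Normalize the request URL to a quoted path before matching: keep
        # only the path component of the (possibly percent-encoded) URL,
        # re-quote it, and fall back to "/" when the path is empty.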
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
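    # Illustrative example (hypothetical rule, not from the original source):
    # a "Disallow: /cgi-bin/" line becomes RuleLine('/cgi-bin/', False),
    # while an empty "Disallow:" value is turned into "allow all" below.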
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path=="*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
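        # Example (agent name borrowed from _test() below): a caller agent of
        # "CherryPickerSE/1.0" is reduced to "cherrypickerse", and an entry
        # whose User-agent token is "CherryPickerSE" matches it as a
        # case-insensitive substring.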
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent=='*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
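        # Rule lines are checked in file order and the first matching path
        # prefix wins; e.g. (illustrative rules) with "Disallow: /cgi-bin/"
        # listed before "Allow: /", a filename of "/cgi-bin/foo" is refused.
        # If no rule matches, access is allowed.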
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)

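# A minimal usage sketch of the public API above (hedged: the host, agent
# name and URL are hypothetical, chosen only for illustration).  It mirrors
# what _test() below does against a real site and is never called here.
def _example_usage():
    rp = RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()
    return rp.can_fetch('ExampleBot/1.0',
                        'http://www.example.com/private/page.html')
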
def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a!=b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print

def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()