# Names that identify automated clients (search-engine crawlers, site
# rippers, e-mail harvesters, HTTP libraries, ...) when they appear in a
# User-Agent header.  Each entry is matched literally; the module escapes the
# entries and joins them into a single regular expression.
bots = [
    'Acoon-Robot',
    'Gigabot',
    'Googlebot',
    'msnbot',
    'msnbot-media/1.0',
    'Teoma',
    'Slurp',
    'aipbot',
    'ia_archiver',
    'Alexibot',
    'Aqua_Products',
    'asterias',
    'b2w/0.1',
    'BackDoorBot/1.0',
    'becomebot',
    'Bloglines/3.',
    'BlowFish/1.0',
    'Bookmark search tool',
    'BotALot',
    'BotRightHere',
    'BuiltBotTough',
    'Bullseye/1.0',
    'BunnySlippers',
    'CCBot/1.0',
    'CheeseBot',
    'CherryPicker',
    'CherryPickerElite/1.0',
    'CherryPickerSE/1.0',
    'Copernic',
    'CopyRightCheck',
    'DittoSpyder',
    'EmailCollector',
    'EmailSiphon',
    'EmailWolf',
    'EroCrawler',
    'ExtractorPro',
    'FairAd Client',
    'Fasterfox',
    'Flaming AttackBot',
    'Foobot',
    'Gaisbot',
    'GetRight/4.2',
    'Harvest/1.5',
    'hloader',
    'HTTrack 3.0',
    'humanlinks',
    'IconSurf',
    'InfoNaviRobot',
    'Iron33/1.0.2',
    'Jakarta Commons-HttpClient',
    'JennyBot',
    'Kenjin Spider',
    'Keyword Density/0.9',
    'larbin',
    'LexiBot',
    'libWeb/clsHTTP',
    'LinkextractorPro',
    'LinkScan/8.1a Unix',
    'LinkWalker',
    'LNSpiderguy',
    'lwp-trivial',
    'Mata Hari',
    'Microsoft URL Control',
    'MIIxpc',
    'MIIxpc/4.2',
    'Mister PiX',
    'moget',
    'MSIECrawler',
    'NetAnts',
    'NICErsPRO',
    'Offline Explorer',
    'Openbot',
    'Openfind',
    'Openfind data gatherer',
    'Oracle Ultra Search',
    'PerMan',
    'ProPowerBot/2.14',
    'ProWebWalker',
    'psbot',
    'Python-urllib',
    'QueryN Metasearch',
    'Radiation Retriever 1.1',
    'RepoMonkey',
    'RepoMonkey Bait & Tackle/v1.01',
    'RMA',
    'searchpreview',
    'SiteSnagger',
    'SpankBot',
    'spanner',
    'SurveyBot',
    'suzuran',
    'Szukacz/1.4',
    'Teleport',
    'TeleportPro',
    'Telesoft',
    'The Intraformant',
    'TheNomad',
    'TightTwatBot',
    'toCrawl/UrlDispatcher',
    'True_Robot',
    'turingos',
    'TurnitinBot',
    'URL Control',
    'URL_Spider_Pro',
    'URLy Warning',
    'VCI',
    'VCI WebViewer VCI WebViewer Win32',
    'Web Image Collector',
    'WebAuto',
    'WebBandit',
    'WebCapture 2.0',
    'WebCopier',
    'WebEnhancer',
    'WebSauger',
    'Website Quester',
    'Webster Pro',
    'WebStripper',
    'WebZip',
    'Wget',
    'WWW-Collector-E',
    'Zeus',
    'Zeus Link Scout',
]
# Only the detector function is part of the public API.
__all__ = ['is_bot_user_agent']

import re

# A single alternation over every entry in ``bots``.  Each name is passed
# through re.escape() so characters such as '.' and '/' match literally, and
# the \b anchors require a word boundary at both ends of the matched name.
_regex = re.compile(r'\b(%s)\b' % '|'.join(map(re.escape, bots)))
def is_bot_user_agent(user_agent):
    """Return True if *user_agent* contains one of the known bot names.

    The string is searched with the module-level ``_regex`` pattern; a match
    anywhere in the string counts.

    Args:
        user_agent: The User-Agent header value.  ``None`` or an empty
            string (for example when the header is missing) is reported as
            not being a bot.

    Returns:
        bool: True when a bot name is found, False otherwise.
    """
    # A missing header must not blow up the caller with a TypeError.
    if not user_agent:
        return False
    # search() stops at the first hit; collecting every match with findall()
    # is wasted work when only a yes/no answer is needed.
    return _regex.search(user_agent) is not None
if __name__ == '__main__':
    # Self-test: running this file as a script checks the detector against a
    # batch of sample User-Agent strings.
    import unittest

    class UserAgentTestCase(unittest.TestCase):
        """Checks is_bot_user_agent() against sample User-Agent strings."""

        def test_batch(self):
            """Classify every sample and compare with the expected verdict."""
            # (expected result, User-Agent string) pairs.  Some agents that
            # describe themselves as robots (e.g. YodaoBot, Twiceler) are
            # expected to be False because their names are not in ``bots``.
            tests = [
                (False, 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080419 Ubuntu/8.04 (hardy) Firefox/2.0.0.14'),
                (False, 'Mozilla/5.0 (compatible; Konqueror/4.0; Linux) KHTML/4.0.3 (like Gecko)'),
                (True, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'),
                (True, 'msnbot-media/1.0 (+http://search.msn.com/msnbot.htm)'),
                (True, 'Wget/1.10.2'),
                (True, 'Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)'),
                (True, 'Mozilla/5.0 (compatible; Ask Jeeves/Teoma; +http://about.ask.com/en/docs/about/webmasters.shtml)'),
                (True, 'msnbot/1.1 (+http://search.msn.com/msnbot.htm)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14'),
                (True, 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)'),
                (False, ''),
                (True, 'Bloglines/3.1 (http://www.bloglines.com; 1 subscriber)'),
                (False, 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:Tailrank (Spinn3r 2.1); http://spinn3r.com/robot) Gecko/20021130'),
                (False, 'curl/7.18.0 (i486-pc-linux-gnu) libcurl/7.18.0 OpenSSL/0.9.8g zlib/1.2.3.3 libidn/1.1'),
                (False, 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.4.4'),
                (False, 'User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2)'),
                (True, 'Nokia6820/2.0 (4.83) Profile/MIDP-1.0 Configuration/CLDC-1.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; TencentTraveler ; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'),
                (False, 'Yandex/1.01.001 (compatible; Win16; I)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14'),
                (False, 'Mozilla/5.0 (Twiceler-0.9 http://www.cuill.com/twiceler/robot.html)'),
                (False, 'Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 3 subscribers; feed-id=10163738114753035636)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20040815 Firefox/0.8 (MOOX M3)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322)'),
                (False, 'Site 24 X 7 RPT-HTTPClient/0.3-3E'),
                (False, 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_4_11; en) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.1 Safari/525.18'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'),
                (False, 'Mozilla/5.0 (compatible; Google Desktop)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.9b5) Gecko/2008032620 Firefox/3.0b5'),
                (False, 'Mozilla/4.0 (compatible;)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14'),
                (False, 'Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 2 subscribers; feed-id=7862281799759567937)'),
                (False, 'Yandex/1.01.001 (compatible; Win16; H)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14'),
                (False, 'Mozilla/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/; )'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; nb-NO; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14'),
                (False, 'YandexSomething/1.0'),
                (False, 'YandexBlog/0.99.101 (compatible; DOS3.30; Mozilla/5.0; B; robot) 0 readers'),
                (False, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; EmbeddedWB 14,52 from: http://www.bsalsa.com/ Embedded Web Browser from: http://bsalsa.com/; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'),
                (False, 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9b5) Gecko/2008050509 Firefox/3.0b5'),
                (False, 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0'),
                (False, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)'),
                (True, 'Jakarta Commons-HttpClient/3.1'),
                (False, 'Nokia6682/2.0 (3.01.1) SymbianOS/8.0 Series60/2.6 Profile/MIDP-2.0 configuration/CLDC-1.1 UP.Link/6.3.0.0.0 (compatible;YahooSeeker/M1A1-R2D2; http://help.yahoo.com/help/us/ysearch/crawling/crawling-01.html)'),
                (True, 'Acoon-Robot 4.0.2.17 (http://www.acoon.de)'),
                (False, 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; fr-fr) AppleWebKit/312.5 (KHTML, like Gecko) Safari/312.3'),
                (True, 'CCBot/1.0 (+http://www.commoncrawl.org/bot.html)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; WOW64; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)'),
                (True, 'Googlebot-Image/1.0'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)'),
                (False, 'Mozilla/5.0(Windows;N;Win98;m18)Gecko/20010124'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'),
                (False, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14,gzip(gfe) (via translate.google.com)'),
                (False, 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/51 (like Gecko) Safari/51'),
                (False, 'Mozilla/5.0 (compatible; Exabot-Images/3.0; +http://www.exabot.com/go/robot)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows XP)'),
                (False, 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080208 Fedora/2.0.0.12-1.fc8 Firefox/2.0.0.12'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; InfoPath.1; .NET CLR 3.0.04506.648)'),
                (False, 'YahooFeedSeeker Testing/2.0 (compatible; Mozilla 4.0; MSIE 5.5; http://publisher.yahoo.com/rssguide; users 1; views 176)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;1813)'),
                (False, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'),
                (False, 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; ja-JP-mac; rv:1.9b5) Gecko/2008032619 Firefox/3.0b5'),
                (False, 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)'),
                (False, 'Mozilla/5.0 (Windows; U; WinNT4.0; en-US; rv:1.2) Gecko/20021126'),
                (False, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)'),
                (False, 'Java/1.4.1_04'),
            ]
            for expect, user_agent in tests:
                # Pass the User-Agent as the failure message so the sample
                # that broke can be identified from the test report.
                self.assertEqual(expect, is_bot_user_agent(user_agent), user_agent)

    def suite():
        """Return a TestSuite containing every UserAgentTestCase test."""
        # unittest.makeSuite() was deprecated in Python 3.11 and removed in
        # 3.13; the loader method below is the documented replacement.
        return unittest.defaultTestLoader.loadTestsFromTestCase(UserAgentTestCase)

    unittest.main()
|