Ticket #138: urlinfo.py

File urlinfo.py, 4.8 KB (added by raspi, 5 months ago)

urlinfo.py plugin

Line 
1# urlinfo.py
2# -*- coding: utf-8 -*-
3
4"""
5Catches URLs on channel and gives information about them like title, image size, etc.
6Uses http://whatisthisfile.appspot.com/ via XMLRPC
7
8Example:
919:20 <@raspi> http://www.youtube.com/watch?v=9RZ-hYPAMFQ
1019:20 <@bot> Title: "YouTube - Black Knight Holy Grail"
1119:28 <@raspi> test http://www.raspi.fi foobar http://raspi.fi/wp-includes/images/rss.png
1219:28 <@bot> 1. Title: "raspi.fi" Redirect: http://raspi.fi/  2. Image: 14x14
13"""
14
15__author__ = u"Pekka 'raspi' Järvinen - http://raspi.fi/"
16__license__ = 'BSD'
17
18from gozerbot.generic import handle_exception, rlog
19from gozerbot.callbacks import callbacks
20from gozerbot.commands import cmnds
21from gozerbot.plughelp import plughelp
22from gozerbot.persist.persist import Persist
23from gozerbot.examples import examples
24
25
26import re
27import urlparse
28import xmlrpclib
29import socket
30
# Register the description string for the 'urlinfo' plugin with plughelp.
31plughelp.add('urlinfo', 'Gets information about URLs spoken on channel')
32
# Persisted plugin state, starting out as an empty dict.  The handlers in this
# file use it as cfg.data[bot.name][ievent.printto] = True/False and write
# changes back with cfg.save().
33cfg = Persist('urlinfo', {})
34
35
36# Remove non-urls word by word
def sanitize(text):
  """Reduce a chat line to its URL-looking words.

  The text is split on any run of whitespace.  A word is kept only when it
  is at least 5 characters long and contains 'www.' or 'http' somewhere.
  For each kept word:

  * one pair of enclosing brackets -- (), [], <> or {} -- is removed when
    the word both starts and ends with the matching characters;
  * 'http://' is prepended when the (unwrapped) word starts with 'www.'.

  Returns the surviving words joined by single spaces, or '' when nothing
  survives (the old implementation raised IndexError in that case).
  """
  closers = {'(': ')', '[': ']', '<': '>', '{': '}'}
  kept = []

  for word in text.split():
    if len(word) < 5:
      continue
    if 'www.' not in word and 'http' not in word:
      # String has to contain www. or http somewhere
      continue

    # First and last character form a bracket pair: drop both
    if closers.get(word[0]) == word[-1]:
      word = word[1:-1]

    # Bare host names get a scheme so the URL regex can pick them up later
    if word.startswith('www.'):
      word = 'http://' + word

    kept.append(word)

  return ' '.join(kept)
73
74# Get valid URLs
def getUrls(text):
  """Return the distinct http/https URLs found in text.

  text is split on single spaces and every regex match inside each piece is
  passed through urlparse and rebuilt with geturl().  The result is a list
  in order of first appearance, without duplicates.
  """
  pattern = re.compile(r"http[s]?://[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]")
  found = []

  for word in text.split(' '):
    for match in pattern.findall(word):
      normalized = urlparse.urlparse(match).geturl()
      if normalized in found:
        continue
      found.append(normalized)

  return found
87
88# Get URL information
def _describeUrl(url, urlinfo):
  """Format one lookup result as text.

  url is the URL that was queried and urlinfo the dict returned for it by
  the XMLRPC service.  Returns '' when the dict holds nothing to report.
  """
  parts = []

  if 'html' in urlinfo:
    if 'title' in urlinfo['html']:
      parts.append('Title: "%s"' % urlinfo['html']['title'].strip())
  elif 'image' in urlinfo:
    image = urlinfo['image']
    parts.append('Image: %dx%d' % (image['width'], image['height']))

  # Only mention the final URL when it differs from the one asked about
  if 'real_url' in urlinfo and urlinfo['real_url'] != url:
    parts.append('Redirect: %s' % urlinfo['real_url'])

  return ' '.join(parts)


def getUrlInfo(text):
  """Look up every URL in text and return a one-line summary.

  The text is passed through sanitize() and getUrls(); each resulting URL
  is sent to http://whatisthisfile.appspot.com/xmlrpc.  With a single URL
  the bare description is returned; with several, each description that
  produced output is prefixed with a running number ("1. ", "2. ", ...)
  and the entries are separated by two spaces.

  A failed lookup no longer disappears silently: it is handed to
  handle_exception() and the remaining URLs are still processed.

  Returns '' when there are no URLs or nothing could be determined.
  """
  urls = getUrls(sanitize(text))
  if not urls:
    return ''

  # NOTE: setdefaulttimeout() is process-wide -- it applies to every socket
  # created afterwards without an explicit timeout, not just to this lookup.
  socket.setdefaulttimeout(30)
  server = xmlrpclib.ServerProxy("http://whatisthisfile.appspot.com/xmlrpc")

  entries = []
  for url in urls:
    try:
      rlog(10, 'urlinfo', "XMLRPC query: %s" % url)
      description = _describeUrl(url, server.app.query(url))
    except Exception:
      # Best effort per URL: record the failure and carry on.
      # NOTE(review): assumes gozerbot's handle_exception() can be called
      # without arguments from inside an except block -- confirm.
      handle_exception()
      continue

    if not description:
      continue

    if len(urls) > 1:
      # Numbering counts only the URLs that actually yielded information
      description = '%d. %s' % (len(entries) + 1, description)
    entries.append(description)

  return '  '.join(entries).strip()
125
126# Catch channel chat for possible URLs
def catchHasUrls(bot, ievent):
  """Pre-condition for the PRIVMSG callback.

  Returns 1 when urlinfo is switched on for this bot name and reply target
  and the message is at least 5 characters long and contains 'www.' or
  'http'; returns 0 otherwise.
  """
  channels = cfg.data.get(bot.name, {})
  if not channels.get(ievent.printto):
    return 0

  message = ievent.txt
  if len(message) < 5:
    return 0

  if 'www.' in message or 'http' in message:
    return 1
  return 0
133
134# Catch channel chat
def catchUrls(bot, ievent):
  """PRIVMSG callback: reply with whatever could be learned about the URLs.

  getUrlInfo() returns '' when the message holds no usable URL or no lookup
  produced output; in that case nothing is sent instead of an empty reply.
  """
  info = getUrlInfo(ievent.txt)
  if info:
    ievent.reply(info)

callbacks.add('PRIVMSG', catchUrls, catchHasUrls, threaded=True)
139
140# Enable on channel
def handle_urlinfo_enable(bot, ievent):
  """Switch urlinfo on for the target this command was issued from.

  Stores True under cfg.data[bot.name][ievent.printto], creating the
  per-bot dict first if needed, saves the config and confirms.
  """
  channels = cfg.data.setdefault(bot.name, {})
  channels[ievent.printto] = True
  cfg.save()
  ievent.reply('urlinfo enabled')

cmnds.add('urlinfo-enable', handle_urlinfo_enable, 'OPER')
examples.add('urlinfo-enable', 'enable urlinfo in the channel', 'urlinfo-enable')
150
151# Disable on channel
def handle_urlinfo_disable(bot, ievent):
  """Switch urlinfo off for the target this command was issued from.

  When the bot has an entry in the config, the target's flag is set to
  False and the config is saved.  The confirmation is sent either way.
  """
  bot_known = bot.name in cfg.data
  if bot_known:
    channels = cfg.data[bot.name]
    channels[ievent.printto] = False
    cfg.save()
  ievent.reply('urlinfo disabled')

cmnds.add('urlinfo-disable', handle_urlinfo_disable, 'OPER')
examples.add('urlinfo-disable', 'disable urlinfo in the channel', 'urlinfo-disable')
160
def handle_urlinfo_list(bot, ievent):
  """Reply with the targets on which urlinfo is currently enabled.

  Output is grouped per bot name, with names and targets sorted.
  urlinfo-disable leaves the target's key in place with a False value, so
  only targets whose flag is true are listed; bots without any enabled
  target are left out.  Replies 'none' when nothing is enabled.
  """
  chans = []

  for name in sorted(cfg.data):
    # Filter on the stored flag, not on mere presence of the key
    targets = sorted(target for target, enabled in cfg.data[name].items() if enabled)
    if targets:
      chans.append('%s: %s' % (name, ' '.join(targets)))

  if not chans:
    ievent.reply('none')
  else:
    ievent.reply('urlinfo enabled on channels: %s' % ', '.join(chans))

cmnds.add('urlinfo-list', handle_urlinfo_list, 'OPER')
examples.add('urlinfo-list', 'show in which channels urlinfo is enabled', 'urlinfo-list')