SpiffyTitles: Implement Wikipedia extracts

This commit is contained in:
kerozene 2015-11-11 21:57:42 +11:00
parent 80ca56d870
commit 05b7ea1a46
3 changed files with 134 additions and 3 deletions

README.md

@ -7,6 +7,7 @@ The ONLY gluten-free plugin for displaying link titles.
- Configurable template so you can decide how titles are displayed and what they say
- Additional information about [Youtube](https://youtube.com) videos
- Additional information about [imgur](https://imgur.com) links
- Article extracts from [Wikipedia](https://en.wikipedia.org) links
- Rate limiting to mitigate abuse
- Configurable white/black list to control where titles are disabled
- Configurable list of user agents
@ -95,7 +96,7 @@ Example output:
^ Snoop Dogg - Pump Pump feat. Lil Malik uploaded by GeorgeRDR3218 @ 00:45:: Duration: 04:41 :: 203,218 views :: 933 likes :: 40 dislikes :: 0 favorites :: 112 comments
### Available variable for the Youtube template ###
### Available variables for the Youtube template ###
Variable | Description
---------------|------------
@ -184,6 +185,27 @@ Default value: `^ [{{ownerscreenname}}] {{title}} :: Duration: {{duration}} :: {
`dailymotionHandlerEnabled` - Whether to enable additional information about dailymotion videos.
### wikipedia handler
`wikipedia.enabled` - Whether to fetch extracts for Wikipedia articles.
`wikipedia.extractTemplate` - Template used to display the extract. The only available variable is `{{extract}}`.
Default value: `^ {{extract}}`
`wikipedia.maxChars` - Extracts longer than this are truncated to this length (the trailing '...' counts towards the limit).
Default value: `240`
`wikipedia.removeParentheses` - Whether to remove parenthesized text from output.
`wikipedia.ignoreSectionLinks` - Whether to ignore links to specific article sections.
`wikipedia.apiParams` - Add or override API query parameters with a space-separated list of key=value pairs (see the example below).
`wikipedia.titleParam` - The query parameter that will hold the page title from the URL.
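For illustration, a minimal sketch of how `apiParams` entries are merged over the handler's default query parameters, mirroring the plugin's own `parse_qsl` plus `dict.update` approach; the overriding key=value pairs shown here are only examples:

```python
try:
    from urlparse import parse_qsl        # Python 2
except ImportError:
    from urllib.parse import parse_qsl    # Python 3

# Defaults used by the handler (from plugin.py)
default_api_params = {
    "format": "json",
    "action": "query",
    "prop": "extracts",
    "exsentences": "2",
    "exlimit": "1",
    "exintro": "",
    "explaintext": ""
}

# Example wikipedia.apiParams value: a space-separated list becomes a list of strings
api_params_setting = ["exsentences=4", "exsectionformat=plain"]

extra_params = dict(parse_qsl('&'.join(api_params_setting)))

api_params = default_api_params.copy()
api_params.update(extra_params)   # user-supplied keys override the defaults

print(api_params["exsentences"])  # -> 4
```

Pairs supplied through `apiParams` replace defaults with the same key, and the page title parameter is always applied last.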
## Other options
`useBold` - Whether to bold the title. Default value: `False`

config.py

@ -152,3 +152,29 @@ conf.registerChannelValue(SpiffyTitles, 'requireCapability',
conf.registerChannelValue(SpiffyTitles, 'ignoredTitlePattern',
registry.Regexp("", _("""Titles matching this pattern will be ignored.""")))
conf.registerGroup(SpiffyTitles, 'wikipedia')
conf.registerChannelValue(SpiffyTitles.wikipedia, 'enabled',
registry.Boolean(True, _("""Whether to fetch extracts for Wikipedia articles.""")))
conf.registerChannelValue(SpiffyTitles.wikipedia, 'apiParams',
registry.SpaceSeparatedListOfStrings([], _("""Add or override API query parameters with a space-separated list of key=value pairs.""")))
conf.registerChannelValue(SpiffyTitles.wikipedia, 'titleParam',
registry.String("titles", _("""The query parameter that will hold the page title from the URL.""")))
# Ideally, links to specific article sections would produce the relevant output for that section. This is not currently implemented.
conf.registerChannelValue(SpiffyTitles.wikipedia, 'ignoreSectionLinks',
registry.Boolean(True, _("""Ignore links to specific article sections.""")))
conf.registerChannelValue(SpiffyTitles.wikipedia, 'maxChars',
registry.Integer(240, _("""Extract will be cut to this length (including '...').""")))
# Remove parenthesized text from output.
conf.registerChannelValue(SpiffyTitles.wikipedia, 'removeParentheses',
registry.Boolean(True, _("""Remove parenthesized text from output.""")))
conf.registerChannelValue(SpiffyTitles.wikipedia, 'extractTemplate',
registry.String("^ {{extract}}", _("""Wikipedia template.""")))

plugin.py

@ -15,10 +15,10 @@ import supybot.callbacks as callbacks
import re
import requests
try:
from urlparse import urlparse
from urllib import urlencode
from urlparse import urlparse, parse_qsl
except ImportError:
from urllib.parse import urlencode, urlparse
from urllib.parse import urlencode, urlparse, parse_qsl
from bs4 import BeautifulSoup
import random
import json
@ -68,6 +68,7 @@ class SpiffyTitles(callbacks.Plugin):
self.add_coub_handlers()
self.add_vimeo_handlers()
self.add_dailymotion_handlers()
self.add_wikipedia_handlers()
def add_dailymotion_handlers(self):
self.handlers["www.dailymotion.com"] = self.handler_dailymotion
@ -78,6 +79,9 @@ class SpiffyTitles(callbacks.Plugin):
def add_coub_handlers(self):
self.handlers["coub.com"] = self.handler_coub
def add_wikipedia_handlers(self):
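# Only the English Wikipedia domain is registered; links to other language editions fall back to the default handler.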
self.handlers["en.wikipedia.org"] = self.handler_wikipedia
def handler_dailymotion(self, url, info, channel):
"""
Handles dailymotion links
@ -821,6 +825,85 @@ class SpiffyTitles(callbacks.Plugin):
return self.handler_default(url, channel)
def handler_wikipedia(self, url, domain, channel):
"""
Queries wikipedia API for article extracts.
"""
wikipedia_handler_enabled = self.registryValue("wikipedia.enabled", channel=channel)
if not wikipedia_handler_enabled:
return self.handler_default(url, channel)
self.log.debug("SpiffyTitles: calling Wikipedia handler for %s" % (url))
pattern = r"/(?:w(?:iki))/(?P<page>[^/]+)$"
info = urlparse(url)
match = re.search(pattern, info.path)
if not match:
self.log.debug("SpiffyTitles: no title found.")
return self.handler_default(url, channel)
elif info.fragment and self.registryValue("wikipedia.ignoreSectionLinks", channel=channel):
self.log.debug("SpiffyTitles: ignoring section link.")
return self.handler_default(url, channel)
else:
page_title = match.groupdict()['page']
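# Default extract query: up to two sentences of the plain-text article intro, returned as JSON.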
default_api_params = {
"format": "json",
"action": "query",
"prop": "extracts",
"exsentences": "2",
"exlimit": "1",
"exintro": "",
"explaintext": ""
}
extra_params = dict(parse_qsl('&'.join(self.registryValue("wikipedia.apiParams", channel=channel))))
title_param = { self.registryValue("wikipedia.titleParam", channel=channel): page_title }
# Merge the query parameters: user-supplied apiParams override the defaults, and the page title parameter is applied last.
api_params = default_api_params.copy()
api_params.update(extra_params)
api_params.update(title_param)
api_url = "https://en.wikipedia.org/w/api.php?%s" % ('&'.join("%s=%s" % (key, val) for (key,val) in api_params.iteritems()))
agent = self.get_user_agent()
headers = {
"User-Agent": agent
}
extract = ""
self.log.debug("SpiffyTitles: requesting %s" % (api_url))
request = requests.get(api_url, headers=headers)
ok = request.status_code == requests.codes.ok
if ok:
response = json.loads(request.text)
if response:
try:
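# The extract sits under query -> pages -> <pageid> -> extract in the API response.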
extract = list(response['query']['pages'].values())[0]['extract']
except KeyError as e:
self.log.error("SpiffyTitles: KeyError parsing Wikipedia API JSON response: %s" % (str(e)))
else:
self.log.error("SpiffyTitles: Error parsing Wikipedia API JSON response")
else:
self.log.error("SpiffyTitles: Wikipedia API HTTP %s: %s" % (request.status_code, request.text))
if extract:
if self.registryValue("wikipedia.removeParentheses", channel=channel):
extract = re.sub(r' ?\([^)]*\)', '', extract)
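# Truncate long extracts at a word boundary, reserving three characters for the trailing '...'.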
max_chars = self.registryValue("wikipedia.maxChars", channel=channel)
if len(extract) > max_chars:
extract = extract[:max_chars - 3].rsplit(' ', 1)[0].rstrip(',.') + '...'
wikipedia_template = Template(self.registryValue("wikipedia.extractTemplate", channel=channel))
return wikipedia_template.render({"extract": extract})
else:
self.log.debug("SpiffyTitles: falling back to default handler")
return self.handler_default(url, channel)
def is_valid_imgur_id(self, input):
"""
Tests if input matches the typical imgur id, which seems to be alphanumeric. Images, galleries,