From 05b7ea1a464bf098068211b08b922ff329958e7b Mon Sep 17 00:00:00 2001
From: kerozene
Date: Wed, 11 Nov 2015 21:57:42 +1100
Subject: [PATCH] SpiffyTitles: Implement Wikipedia extracts

---
 README.md | 24 ++++++++++++++
 config.py | 26 +++++++++++++++++
 plugin.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 134 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e9ed42c..a622365 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ The ONLY gluten-free plugin for displaying link titles.
 - Configurable template so you can decide how titles are displayed and what they say
 - Additional information about [Youtube](https://youtube.com) videos
 - Additional information about [imgur](https://imgur.com) links
+- Article extracts from [Wikipedia](https://en.wikipedia.org) links
 - Rate limiting to mitigate abuse
 - Configurable white/black list to control where titles are disabled
 - Configurable list of user agents
@@ -95,7 +96,7 @@ Example output:
 
     ^ Snoop Dogg - Pump Pump feat. Lil Malik uploaded by GeorgeRDR3218 @ 00:45:: Duration: 04:41 :: 203,218 views :: 933 likes :: 40 dislikes :: 0 favorites :: 112 comments
 
-### Available variable for the Youtube template ###
+### Available variables for the Youtube template ###
 
 Variable       | Description
 ---------------|------------
@@ -184,6 +185,27 @@ Default value: `^ [{{ownerscreenname}}] {{title}} :: Duration: {{duration}} :: {
 
 `dailymotionHandlerEnabled` - Whether to enable additional information about dailymotion videos.
 
+### wikipedia handler
+
+`wikipedia.enabled` - Whether to fetch extracts for Wikipedia articles.
+
+`wikipedia.extractTemplate` - Template used to display the Wikipedia extract.
+
+Default value: `^ {{extract}}`
+
+`wikipedia.maxChars` - The extract will be truncated to this length, including the trailing '...'.
+
+Default value: `240`
+
+`wikipedia.removeParentheses` - Whether to remove parenthesized text from output.
+
+`wikipedia.ignoreSectionLinks` - Whether to ignore links to specific article sections.
+
+`wikipedia.apiParams` - Add or override API query parameters with a space-separated list of key=value pairs.
+
+`wikipedia.titleParam` - The query parameter that will hold the page title from the URL.
+
+
 ## Other options
 
 `useBold` - Whether to bold the title. Default value: `False`
diff --git a/config.py b/config.py
index bdd6764..d2774d5 100644
--- a/config.py
+++ b/config.py
@@ -152,3 +152,29 @@ conf.registerChannelValue(SpiffyTitles, 'requireCapability',
 
 conf.registerChannelValue(SpiffyTitles, 'ignoredTitlePattern',
     registry.Regexp("", _("""Titles matching this pattern will be ignored.""")))
+
+
+conf.registerGroup(SpiffyTitles, 'wikipedia')
+
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'enabled',
+    registry.Boolean(True, _("""Whether to fetch extracts for Wikipedia articles.""")))
+
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'apiParams',
+    registry.SpaceSeparatedListOfStrings([], _("""Add or override API query parameters with a space-separated list of key=value pairs.""")))
+
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'titleParam',
+    registry.String("titles", _("""The query parameter that will hold the page title from the URL.""")))
+
+# Ideally, links to specific article sections would produce the relevant output for that section. This is not currently implemented.
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'ignoreSectionLinks',
+    registry.Boolean(True, _("""Whether to ignore links to specific article sections.""")))
+
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'maxChars',
+    registry.Integer(240, _("""The extract will be truncated to this length, including the trailing '...'.""")))
+
+# Remove parenthesized text from output.
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'removeParentheses',
+    registry.Boolean(True, _("""Whether to remove parenthesized text from output.""")))
+
+conf.registerChannelValue(SpiffyTitles.wikipedia, 'extractTemplate',
+    registry.String("^ {{extract}}", _("""Template used to display the Wikipedia extract.""")))
diff --git a/plugin.py b/plugin.py
index cb42a90..6504daf 100644
--- a/plugin.py
+++ b/plugin.py
@@ -15,10 +15,10 @@ import supybot.callbacks as callbacks
 import re
 import requests
 try:
-    from urlparse import urlparse
     from urllib import urlencode
+    from urlparse import urlparse, parse_qsl
 except ImportError:
-    from urllib.parse import urlencode, urlparse
+    from urllib.parse import urlencode, urlparse, parse_qsl
 from bs4 import BeautifulSoup
 import random
 import json
@@ -68,6 +68,7 @@ class SpiffyTitles(callbacks.Plugin):
         self.add_coub_handlers()
         self.add_vimeo_handlers()
         self.add_dailymotion_handlers()
+        self.add_wikipedia_handlers()
 
     def add_dailymotion_handlers(self):
         self.handlers["www.dailymotion.com"] = self.handler_dailymotion
@@ -78,6 +79,9 @@ class SpiffyTitles(callbacks.Plugin):
     def add_coub_handlers(self):
         self.handlers["coub.com"] = self.handler_coub
 
+    def add_wikipedia_handlers(self):
+        self.handlers["en.wikipedia.org"] = self.handler_wikipedia
+
     def handler_dailymotion(self, url, info, channel):
         """
         Handles dailymotion links
@@ -821,6 +825,85 @@ class SpiffyTitles(callbacks.Plugin):
 
         return self.handler_default(url, channel)
 
+    def handler_wikipedia(self, url, domain, channel):
+        """
+        Queries the Wikipedia API for article extracts.
+        """
+        wikipedia_handler_enabled = self.registryValue("wikipedia.enabled", channel=channel)
+        if not wikipedia_handler_enabled:
+            return self.handler_default(url, channel)
+
+        self.log.debug("SpiffyTitles: calling Wikipedia handler for %s" % (url))
+
+        pattern = r"/(?:w(?:iki))/(?P<page>[^/]+)$"
+        info = urlparse(url)
+        match = re.search(pattern, info.path)
+        if not match:
+            self.log.debug("SpiffyTitles: no title found.")
+            return self.handler_default(url, channel)
+        elif info.fragment and self.registryValue("wikipedia.ignoreSectionLinks", channel=channel):
+            self.log.debug("SpiffyTitles: ignoring section link.")
+            return self.handler_default(url, channel)
+        else:
+            page_title = match.groupdict()['page']
+
+        default_api_params = {
+            "format": "json",
+            "action": "query",
+            "prop": "extracts",
+            "exsentences": "2",
+            "exlimit": "1",
+            "exintro": "",
+            "explaintext": ""
+        }
+        extra_params = dict(parse_qsl('&'.join(self.registryValue("wikipedia.apiParams", channel=channel))))
+        title_param = { self.registryValue("wikipedia.titleParam", channel=channel): page_title }
+
+        # merge dicts
+        api_params = default_api_params.copy()
+        api_params.update(extra_params)
+        api_params.update(title_param)
+        api_url = "https://en.wikipedia.org/w/api.php?%s" % ('&'.join("%s=%s" % (key, val) for (key, val) in api_params.items()))
+
+        agent = self.get_user_agent()
+        headers = {
+            "User-Agent": agent
+        }
+        extract = ""
+
+        self.log.debug("SpiffyTitles: requesting %s" % (api_url))
+
+        request = requests.get(api_url, headers=headers)
+        ok = request.status_code == requests.codes.ok
+
+        if ok:
+            response = json.loads(request.text)
+
+            if response:
+                try:
+                    extract = list(response['query']['pages'].values())[0]['extract']
+                except KeyError as e:
+                    self.log.error("SpiffyTitles: KeyError parsing Wikipedia API JSON response: %s" % (str(e)))
+            else:
+                self.log.error("SpiffyTitles: Error parsing Wikipedia API JSON response")
+        else:
+            self.log.error("SpiffyTitles: Wikipedia API HTTP %s: %s" % (request.status_code, request.text))
+
+        if extract:
+            if self.registryValue("wikipedia.removeParentheses", channel=channel):
+                extract = re.sub(r' ?\([^)]*\)', '', extract)
+            max_chars = self.registryValue("wikipedia.maxChars", channel=channel)
+            if len(extract) > max_chars:
+                extract = extract[:max_chars - 3].rsplit(' ', 1)[0].rstrip(',.') + '...'
+
+            wikipedia_template = Template(self.registryValue("wikipedia.extractTemplate", channel=channel))
+            return wikipedia_template.render({"extract": extract})
+        else:
+            self.log.debug("SpiffyTitles: falling back to default handler")
+
+        return self.handler_default(url, channel)
+
+
     def is_valid_imgur_id(self, input):
         """
         Tests if input matches the typical imgur id, which seems to be alphanumeric. Images, galleries,
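
For reference, a minimal standalone sketch of the query handler_wikipedia assembles with its default settings, here for a link such as https://en.wikipedia.org/wiki/Internet_Relay_Chat. The title below simply stands in for whatever /wiki/<page> segment the posted URL carries, and any wikipedia.apiParams overrides would be merged on top of these defaults:

    # Illustrative sketch only - mirrors the default extracts query built by the handler.
    import requests

    params = {
        "format": "json",
        "action": "query",
        "prop": "extracts",
        "exsentences": "2",
        "exlimit": "1",
        "exintro": "",        # empty value: the API only checks that the key is present
        "explaintext": "",
        "titles": "Internet_Relay_Chat",  # taken from the /wiki/<page> part of the URL
    }
    response = requests.get("https://en.wikipedia.org/w/api.php", params=params)
    pages = response.json()["query"]["pages"]
    # Pages are keyed by page id, so take the first (and only) entry's extract.
    print(list(pages.values())[0]["extract"])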