From 49df8e850a6cfa77f0aaf3fc9684cd8684efeb33 Mon Sep 17 00:00:00 2001
From: oddluck <39967334+oddluck@users.noreply.github.com>
Date: Wed, 4 Mar 2020 17:05:38 +0000
Subject: [PATCH] SpiffyTitles: file type/size, fake-useragent, badLinkText

---
 SpiffyTitles/README.md        |  8 ++--
 SpiffyTitles/config.py        |  4 +-
 SpiffyTitles/plugin.py        | 73 +++++++++++++++--------------------
 SpiffyTitles/requirements.txt |  1 +
 4 files changed, 38 insertions(+), 48 deletions(-)

diff --git a/SpiffyTitles/README.md b/SpiffyTitles/README.md
index bf98ff6..b6afd7a 100644
--- a/SpiffyTitles/README.md
+++ b/SpiffyTitles/README.md
@@ -285,12 +285,14 @@ improving performance. Default value: `60`
 `wallClockTimeoutInSeconds` - Timeout for total elapsed time when retrieving a title. If you set this value too 
 high, the bot may time out. Default value: `8` (seconds). You must `!reload SpiffyTitles` for this setting to take effect.
 
-`channelWhitelist` - a comma separated list of channels in which titles should be displayed. If `""`,
+`channelWhitelist` - A comma-separated list of channels in which titles should be displayed. If `""`,
 titles will be shown in all channels. Default value: `""`
 
-`channelBlacklist` - a comma separated list of channels in which titles should never be displayed. If `""`,
+`channelBlacklist` - A comma-separated list of channels in which titles should never be displayed. If `""`,
 titles will be shown in all channels. Default value: `""`
 
+`badLinkText` - The text to return when unable to retrieve a title from a URL. Default value: `Nice link idiot.`
+
 ### About white/black lists
 - Channel names must be in lowercase
 - If `channelWhitelist` and `channelBlacklist` are empty, then titles will be displayed in every channel
@@ -337,8 +339,6 @@ Ignore all links except youtube, imgur, and reddit
 
     !config supybot.plugins.SpiffyTitles.whitelistDomainPattern /(reddit\.com|youtube\.com|youtu\.be|imgur\.com)/
 
-`userAgents` - A comma separated list of strings of user agents randomly chosen when requesting. 
-
 `urlRegularExpression` - A regular expression used to match URLs. You shouldn't need to change this.
 
 `linkMessageIgnorePattern` - If a message matches this pattern, it will be ignored. This differs from `ignoredDomainPattern` in that it compares against the entire message rather than just the domain.
diff --git a/SpiffyTitles/config.py b/SpiffyTitles/config.py
index ea44937..e7411e2 100644
--- a/SpiffyTitles/config.py
+++ b/SpiffyTitles/config.py
@@ -69,8 +69,8 @@ conf.registerChannelValue(SpiffyTitles, 'useBold',
      registry.Boolean(False, _("""Use bold in titles""")))
 
 # User agents
-conf.registerGlobalValue(SpiffyTitles, 'userAgents',
-     registry.CommaSeparatedListOfStrings(["Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.60 Safari/537.36", "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0", "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"], _("""Reported user agent when fetching links""")))
+conf.registerGlobalValue(SpiffyTitles, 'badLinkText',
+     registry.String("Nice link idiot.", _("""Title to return for bad/unsnarfable links.""")))
 
 # Mime Types
 conf.registerGlobalValue(SpiffyTitles, 'mimeTypes',
diff --git a/SpiffyTitles/plugin.py b/SpiffyTitles/plugin.py
index 3cc5a7a..dbafc3a 100644
--- a/SpiffyTitles/plugin.py
+++ b/SpiffyTitles/plugin.py
@@ -48,6 +48,7 @@ import unicodedata
 import supybot.ircdb as ircdb
 import supybot.log as log
 import pendulum
+from fake_useragent import UserAgent
 
 try:
     from supybot.i18n import PluginInternationalization
@@ -117,7 +118,7 @@ class SpiffyTitles(callbacks.Plugin):
         self.handlers["www.twitch.tv"] = self.handler_twitch
         self.handlers["go.twitch.tv"] = self.handler_twitch
         self.handlers["clips.twitch.tv"] = self.handler_twitch
-        
+
     def add_imdb_handlers(self):
         """
         Enables meta info about IMDB links through the OMDB API
@@ -158,11 +159,7 @@ class SpiffyTitles(callbacks.Plugin):
             fields = "id,title,owner.screenname,duration,views_total"
             api_url = "https://api.dailymotion.com/video/%s?fields=%s" % (video_id, fields)
             log.debug("SpiffyTitles: looking up dailymotion info: %s", api_url)
-            agent = self.get_user_agent()
-            headers = {
-                "User-Agent": agent
-            }
-
+            headers = self.get_headers()
             request = requests.get(api_url, headers=headers)
 
             ok = request.status_code == requests.codes.ok
@@ -213,11 +210,7 @@ class SpiffyTitles(callbacks.Plugin):
             if video_id is not None:
                 api_url = "https://vimeo.com/api/v2/video/%s.json" % video_id
                 log.debug("SpiffyTitles: looking up vimeo info: %s", api_url)
-                agent = self.get_user_agent()
-                headers = {
-                    "User-Agent": agent
-                }
-
+                headers = self.get_headers()
                 request = requests.get(api_url, headers=headers)
 
                 ok = request.status_code == requests.codes.ok
@@ -279,11 +272,7 @@ class SpiffyTitles(callbacks.Plugin):
                 video_id = video_id.split("?")[0]
 
             api_url = "http://coub.com/api/v2/coubs/%s" % video_id
-            agent = self.get_user_agent()
-            headers = {
-                "User-Agent": agent
-            }
-
+            headers = self.get_headers()
             request = requests.get(api_url, headers=headers)
 
             ok = request.status_code == requests.codes.ok
@@ -491,7 +480,7 @@ class SpiffyTitles(callbacks.Plugin):
         except Exception as e:
             pass
 
-        if title is not None and title:
+        if title:
             irc.reply(title)
         else:
             irc.reply(error_message + " {}".format(err))
@@ -663,10 +652,7 @@ class SpiffyTitles(callbacks.Plugin):
             }
             encoded_options = urlencode(options)
             api_url = "https://www.googleapis.com/youtube/v3/videos?%s" % (encoded_options)
-            agent = self.get_user_agent()
-            headers = {
-                "User-Agent": agent
-            }
+            headers = self.get_headers()
 
             log.debug("SpiffyTitles: requesting %s" % (api_url))
 
@@ -886,10 +872,7 @@ class SpiffyTitles(callbacks.Plugin):
         if not match:
             self.log.debug("SpiffyTitles: twitch - no title found.")
             return self.handler_default(url, channel)
-        agent = self.get_user_agent()
-        headers = {
-            "Client-ID": twitch_client_id
-        }
+        headers = self.get_headers()
         self.log.debug("SpiffyTitles: twitch - requesting %s" % (data_url))
         request = requests.get(data_url, timeout=10, headers=headers)
         ok = request.status_code == requests.codes.ok
@@ -1212,11 +1195,7 @@ class SpiffyTitles(callbacks.Plugin):
         api_params.update(title_param)
         param_string = "&".join("%s=%s" % (key, val) for (key, val) in api_params.items())
         api_url = "https://%s/w/api.php?%s" % (info.netloc, param_string)
-
-        agent = self.get_user_agent()
-        headers = {
-            "User-Agent": agent
-        }
+        headers = self.get_headers()
         extract = ""
 
         self.log.debug("SpiffyTitles: requesting %s" % (api_url))
@@ -1292,10 +1271,7 @@ class SpiffyTitles(callbacks.Plugin):
             self.log.debug("SpiffyTitles: no title found.")
             return self.handler_default(url, channel)
 
-        agent = self.get_user_agent()
-        headers = {
-            "User-Agent": agent
-        }
+        headers = self.get_headers()
 
         self.log.debug("SpiffyTitles: requesting %s" % (data_url))
 
@@ -1621,14 +1597,30 @@ class SpiffyTitles(callbacks.Plugin):
                     else:
                         log.debug("SpiffyTitles: unacceptable mime type %s for url %s" %
                                   (content_type, url))
+                        suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']
+                        def humansize(nbytes):
+                            i = 0
+                            while nbytes >= 1024 and i < len(suffixes)-1:
+                                nbytes /= 1024.
+                                i += 1
+                            f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
+                            return '%s %s' % (f, suffixes[i])
+                        size = request.headers.get("content-length")
+                        if size:
+                            size = humansize(int(size))
+                            text = "[{0}] ({1})".format(content_type, size)
+                        else:
+                            text = "[{0}]".format(content_type)
+                        text = "<html><head><title>{0}</title></head><body></body></html>".format(text)
+                        return (text, is_redirect)
                 else:
                     log.error("SpiffyTitles HTTP response code %s" % (request.status_code,))
-                                                                           #request.content))
-                    return ('<html><head><title>Nice link idiot.</title></head><body></body></html>', is_redirect)
+                    text = self.registryValue("badLinkText")
+                    text = "<html><head><title>{0}</title></head><body></body></html>".format(text)
+                    return (text, is_redirect)
 
         except timeout_decorator.TimeoutError:
             log.error("SpiffyTitles: wall timeout!")
-
             self.get_source_by_url(url, retries + 1)
         except requests.exceptions.MissingSchema as e:
             url_wschema = "http://%s" % (url)
@@ -1640,11 +1632,9 @@ class SpiffyTitles(callbacks.Plugin):
                 return self.get_source_by_url(url_wschema)
         except requests.exceptions.Timeout as e:
             log.error("SpiffyTitles Timeout: %s" % (str(e)))
-
             self.get_source_by_url(url, retries + 1)
         except requests.exceptions.ConnectionError as e:
             log.error("SpiffyTitles ConnectionError: %s" % (str(e)))
-
             self.get_source_by_url(url, retries + 1)
         except requests.exceptions.HTTPError as e:
             log.error("SpiffyTitles HTTPError: %s" % (str(e)))
@@ -1674,9 +1664,8 @@ class SpiffyTitles(callbacks.Plugin):
         """
         Returns a random user agent from the ones available
         """
-        agents = self.registryValue("userAgents")
-
-        return random.choice(agents)
+        ua = UserAgent(fallback="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0")
+        return str(ua.random)
 
     def message_matches_ignore_pattern(self, input):
         """
diff --git a/SpiffyTitles/requirements.txt b/SpiffyTitles/requirements.txt
index c5d7956..8ed64c0 100644
--- a/SpiffyTitles/requirements.txt
+++ b/SpiffyTitles/requirements.txt
@@ -7,3 +7,4 @@ requests
 timeout-decorator
 certifi
 pendulum
+fake-useragent