Moving search to Qwant

Adriene Hutchins 2020-03-24 19:09:01 -04:00
parent 19acae2abc
commit 391beaf422
6 changed files with 422 additions and 229 deletions

View file

@@ -267,6 +267,14 @@ Guild count: {len(self.bot.guilds)}
            # Message Sending
            await ctx.send(msg)

    @commands.command()
    @commands.is_owner()
    async def toggle_debug(self, ctx):
        """Toggles debug while running."""

        self.bot.debug_toggle = not self.bot.debug_toggle
        await ctx.send(f"Set debug mode to `{self.bot.debug_toggle}`.")

    @commands.command(aliases=['exit', 'reboot'])
    @commands.is_owner()
    async def restart(self, ctx):

View file

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# search exceptions
# Provides custom exceptions for the search cog.

"""Search Exceptions File"""

from discord.ext import commands


class SafesearchFail(commands.CommandError):
    """Thrown when a query contains NSFW content."""
    pass


def setup(bot):
    pass
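
Because SafesearchFail subclasses commands.CommandError, discord.py dispatches it to on_command_error handlers as-is instead of wrapping it in CommandInvokeError, which is what the search cog and main.py rely on later in this commit. The following is an illustrative sketch of that flow and is not part of the commit; the cog and command names are hypothetical.

# Illustrative sketch only; ExampleCog and the lookup command are hypothetical.
from discord.ext import commands
from extensions.models import SearchExceptions


class ExampleCog(commands.Cog):
    @commands.command()
    async def lookup(self, ctx, *, query: str):
        # Stand-in for the real word filter in the search cog.
        if 'badword' in query:
            raise SearchExceptions.SafesearchFail('Query had NSFW.')
        await ctx.send(f"Searching for `{query}`...")

    @commands.Cog.listener()
    async def on_command_error(self, ctx, error):
        # CommandError subclasses arrive here unwrapped.
        if isinstance(error, SearchExceptions.SafesearchFail):
            await ctx.send("**Sorry!** That query isn't allowed here.")


def setup(bot):
    bot.add_cog(ExampleCog(bot))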

View file

@@ -10,6 +10,8 @@ from discord.ext import commands
import aiohttp
import random
from typing import List
from extensions.models import SearchExceptions
import html2text


class Search(commands.Cog, name="Basic"):
@@ -21,193 +23,153 @@ class Search(commands.Cog, name="Basic"):
        self.bot = bot
        self.info = bot.logging.info
        self.warn = bot.logging.warn
        self.debug = bot.logging.debug
        self.request = bot.request
        self.emoji = "\U0001F50D"
        self.scrape_token = bot.config['SCRAPESTACK']

        # Markdown converter
        self.tomd = html2text.HTML2Text()
        self.tomd.ignore_links = True
        self.tomd.ignore_images = True
        self.tomd.ignore_tables = True
        self.tomd.ignore_emphasis = True
        self.tomd.body_width = 0

    async def _search_logic(self, query: str, is_nsfw: bool = False,
                            category: str = 'web', count: int = 5) -> list:
        """Uses scrapestack and the Qwant API to find search results."""

        # Typing
        base: str
        safesearch: str

        # NSFW Filtering
        # WARNING This list includes slurs.
        nono_words = [
            'tranny', 'faggot', 'fag', 'porn', 'cock', 'dick',
            'titty', ' tit ', 'boob', 'penis', 'slut', ' cum ', 'jizz',
            'semen', 'cooch', 'coochie', 'pussy', 'penis', 'fetish',
            'bdsm', 'sexy', 'xxx', 'orgasm', 'masturbat',
            'erotic', 'creampie', 'fap', 'nude', 'orgasm',
            'squirting', 'yiff', 'e621', ' sex', 'ejaculat',
            'cunt', 'vagina', 'coom', 'troon', 'hentai', 'yaoi',
            'bukkake', 'bara', 'shota', 'loli', 'fetish', 'spunk',
            'pron', 'p0rn', 'pr0n', 'gloryhole', 'felch', 'skullfuck',
            'scat', 'pissplay', 'piss play', 'underage', 'bbw',
            'fisting', 'queef', "rimming", 'rimjob', 'bdsm',
            'cbt', 'blumpkin', 'boner', 'prostitut', 'butt plug',
            'transvestite', 'femboy', 'castrat', 'philia', 'edging',
            'edgeplay', 'enema', 'facial', 'fellat', 'femdom', 'footjob',
            'blowjob', 'titjob', 'handjob', 'frot', 'gang bang', 'gangbang',
            'glory hole', 'hermap', 'jerk off', 'jerking off', 'jack off',
            'jacking off', 'kink', 'wet dream', 'anal', 'pegging', 'precum',
            'pre-cum', 'pre cum', 'priap', 'scrotum', 'shemale', 'smegma',
            'smut', 'softcore', 'transsexual', 'voyeur', 'viagra', 'wank',
            'whore'
        ]
        if any(n in query for n in nono_words):
            raise SearchExceptions.SafesearchFail('Query had NSFW.')

        # Scrape or not
        # if self.scrape_token != '':
        #     base = (
        #         "http://api.scrapestack.com/scrape"
        #         f"?access_key={self.scrape_token}"
        #         f"&url=https://api.qwant.com/api"
        #     )
        #     print(base)
        # else:
        base = "https://api.qwant.com/api"

        # Safesearch
        if is_nsfw:
            safesearch = "0"
        else:
            safesearch = "2"

        # Search URL Building
        # api.qwant.com/api/search/web?count=5&q=test&safesearch=2&...
        search_url = (
            f"{base}/search/{category}"
            f"?count={count}"
            f"&q={query}"
            f"&safesearch={safesearch}"
            "&t=web"
            "&locale=en_US"
            "&uiv=4"
        )
        await self.debug(search_url, name="_search_logic")

        # Searching
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)'
                ' Gecko/20100101 Firefox/74.0'
            )
        }
        async with self.request.get(search_url, headers=headers) as resp:
            to_parse = await resp.json()

            # Sends results
            return to_parse['data']['result']['items']

    async def _basic_search(self, ctx, query: str, category: str = 'web'):
        """Basic search formatting."""

        # NOTE Customizable count not yet implemented.
        count: int = 5

        # Safesearch variable
        is_nsfw = (
            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
            else False
        )

        # Handling
        async with ctx.typing():

            # Searches
            results = await self._search_logic(query, is_nsfw, category)
            count = len(results)

            # Escapes all nasties for displaying
            query_display = discord.utils.escape_mentions(query)
            query_display = discord.utils.escape_markdown(query_display)

            # Return if no results
            try:
                results[0]
            except IndexError:
                return await ctx.send(
                    f"No results found for `{query_display}`."
                )

            # Gets the first entry's stuff
            first_title = self.tomd.handle(results[0]['title']).rstrip('\n')
            first_url = results[0]['url']
            first_desc = self.tomd.handle(results[0]['desc']).rstrip('\n')

            # Builds the substring for each of the other results.
            other_results: List[str] = []
            for r in results[1:count]:
                title = self.tomd.handle(r['title']).rstrip('\n')
                url = r['url']
                other_results.append(f"**{title}** <{url}>")
            other_msg: str = "\n".join(other_results)

            # Builds message
            msg = (
                f"Showing **{count}** results for `{query_display}`.\n\n"
                f"**{first_title}** <{first_url}>\n{first_desc}\n\n"
                f"{other_msg}\n\n_Powered by Qwant._"
            )

            # Sends message
            await self.debug(msg, name="_basic_search")
            await ctx.send(msg)

    @commands.command()
    async def search(self, ctx, *, query: str):
        """Search online for general results."""
@@ -250,51 +212,27 @@ class Search(commands.Cog, name="Basic"):
        await self._basic_search(ctx, query, 'maps')

    @commands.Cog.listener()
    async def on_command_error(self, ctx, error):
        """Listener makes no command fallback to searching."""

        fallback = (commands.CommandNotFound, commands.CheckFailure)

        if isinstance(error, fallback):
            try:
                await self._basic_search(
                    ctx, ctx.message.content[len(ctx.prefix):]
                )
            except SearchExceptions.SafesearchFail:
                await ctx.send(
                    "**Sorry!** That query included language "
                    "we cannot accept in a non-NSFW channel. "
                    "Please try again in an NSFW channel."
                )
            except Exception as e:
                print(e)


def setup(bot):
    bot.add_cog(Search(bot))
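
For context on the data _search_logic returns: it queries the Qwant web search API and hands back data.result.items, and _basic_search reads the title, url, and desc fields of each item. Below is a standalone sketch of the same call, assuming the endpoint still accepts the parameters used in the hunk above; it is illustrative only and not part of the commit.

# Illustrative, standalone sketch of the Qwant call the cog makes; the
# endpoint's behavior is assumed from this diff and may change on Qwant's side.
import asyncio
import aiohttp


async def qwant_search(query: str, count: int = 5, safesearch: str = "2") -> list:
    """Returns items shaped like to_parse['data']['result']['items']."""
    params = {
        'count': str(count), 'q': query, 'safesearch': safesearch,
        't': 'web', 'locale': 'en_US', 'uiv': '4',
    }
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)'
            ' Gecko/20100101 Firefox/74.0'
        )
    }
    async with aiohttp.ClientSession() as session:
        async with session.get('https://api.qwant.com/api/search/web',
                               params=params, headers=headers) as resp:
            data = await resp.json()
    # Each item carries at least 'title', 'url', and 'desc'; the cog runs
    # 'title' and 'desc' through html2text before displaying them.
    return data['data']['result']['items']


if __name__ == '__main__':
    for item in asyncio.run(qwant_search('discord.py'))[:3]:
        print(item['url'])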

View file

@@ -19,6 +19,7 @@ class Logging():
        self.request = bot.request
        self.online = bot.online
        self.maintenance = bot.maintenance
        self.debug_toggle = bot.debug_toggle

        # Sets info hook first
        self.info_hook = self.online.get_webhook(
@@ -26,7 +27,6 @@ class Logging():
            if bot.config['HOOKS']['INFO_HOOK'] \
            else None

        # Sets other hooks or defaults them
        if self.info_hook:
            self.warn_hook = self.online.get_webhook(
@@ -51,7 +51,7 @@ class Logging():
        # Prerequisites
        formatted_tb = traceback.format_tb(error.__traceback__)
        tb_str = ''.join(formatted_tb)
        original_exc = traceback.format_exception(
            type(error), error, error.__traceback__)
@@ -77,7 +77,7 @@ class Logging():
        trace_content = (
            "```py\n\nTraceback (most recent call last):"
            "\n{}{}: {}```").format(
                tb_str,
                type(error).__name__,
                error)
@@ -120,7 +120,9 @@ class Logging():
        if self.warn_hook:
            return await self.warn_hook.send(
                content=content,
                username=(
                    f"{self.bot.user.name} - {name if name else 'unknown'}"
                ),
                avatar_url=str(self.bot.user.avatar_url),
                embed=embed
            )
@@ -144,7 +146,9 @@ class Logging():
            )
            await self.error_hook.send(
                content=fallback,
                username=(
                    f"{self.bot.user.name} - {name if name else 'unknown'}"
                ),
                avatar_url=str(self.bot.user.avatar_url),
                embed=error_embed
            )
@@ -168,10 +172,12 @@ class Logging():
                    name: Optional[str] = None):
        """Logs warnings and sends them to the appropriate places."""

        if self.debug_hook and (self.maintenance or self.debug_toggle):
            return await self.debug_hook.send(
                content=f"```{content}```",
                username=(
                    f"{self.bot.user.name} - {name if name else 'unknown'}"
                ),
                avatar_url=str(self.bot.user.avatar_url),
                embed=embed
            )
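
Taken together with the toggle_debug command added earlier in this commit, these logging changes mean bot.logging.debug() forwards its content to the debug webhook, wrapped in a Discord code block, whenever a debug hook is configured and either maintenance mode or the new debug_toggle is on. The cog below is a hypothetical usage sketch, not part of the commit; only bot.logging.debug, bot.debug_toggle, and the name= keyword come from this diff.

# Hypothetical cog; illustrates the call pattern only.
from discord.ext import commands


class ExampleCog(commands.Cog):
    def __init__(self, bot):
        self.bot = bot
        self.debug = bot.logging.debug  # same pattern the Search cog uses

    @commands.command()
    async def ping(self, ctx):
        # Forwarded to the debug webhook (inside a ``` code block) only when
        # a debug hook exists and maintenance or debug_toggle is on;
        # otherwise the call is a no-op.
        await self.debug(f"ping from {ctx.author}", name="ping")
        await ctx.send("Pong!")


def setup(bot):
    bot.add_cog(ExampleCog(bot))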

21
main.py
View file

@@ -9,14 +9,13 @@
import discord
from discord.ext import commands

import json
import os
import sys

import aiohttp
import rethinkdb
from typing import List

from extensions.models import SearchExceptions


class Bot(commands.Bot):
@@ -30,6 +29,7 @@ class Bot(commands.Bot):
        # Setup
        self.extensions_list: List[str] = []
        self.debug_toggle = False

        with open('config.json') as f:
            self.config = json.load(f)
@@ -249,13 +249,26 @@ async def on_command_error(ctx, error):
    """Handles all errors stemming from ext.commands."""

    # Lets other cogs handle CommandNotFound.
    # Change this if you want command not found handling.
    if (
        isinstance(error, commands.CommandNotFound)
        or isinstance(error, commands.CheckFailure)
    ):
        return

    # Custom message for if an argument is missing.
    elif isinstance(error, commands.MissingRequiredArgument):
        await ctx.send(
            f"**Missing Argument!** A `{error.param.name}` is needed."
        )

    elif isinstance(error, SearchExceptions.SafesearchFail):
        await ctx.send(
            "**Sorry!** That query included language "
            "we cannot accept in a non-NSFW channel. "
            "Please try again in an NSFW channel."
        )

    # Provides a very pretty embed if something's actually a dev's fault.
    elif isinstance(error, commands.CommandInvokeError):

211
old_search.py Normal file
View file

@@ -0,0 +1,211 @@
# This is the old search logic for reference purposes

async def _old_search_logic(self, query: str, is_nsfw: bool = False,
                            category: str = None) -> str:
    """Provides search logic for all search commands."""

    # NSFW Filtering
    # WARNING - This list includes slurs.
    nono_words = [
        'tranny', 'faggot', 'fag',
        'porn', 'cock', 'dick',
        'titty', 'boob', 'penis',
        'slut', 'cum', 'jizz',
        'semen', 'cooch', 'coochie',
        'pussy', 'penis', 'fetish',
        'bdsm', 'sexy', 'xxx',
        'orgasm', 'masturbation',
        'erotic', 'creampie',
        'fap', 'nude', 'orgasm',
        'squirting', 'yiff',
        'e621'
    ]
    nono_sites = [
        'xvideos', 'pornhub',
        'xhamster', 'xnxx',
        'youporn', 'xxx',
        'freexcafe', 'sex.com',
        'e621', 'nhentai'
    ]

    if not is_nsfw:
        for i in nono_words:
            if i in query.replace(" ", ""):
                return (
                    "**Sorry!** That query included language "
                    "we cannot accept in a non-NSFW channel. "
                    "Please try again in an NSFW channel."
                )

    # Choose an instance
    if self.instances == []:
        with open('searxes.txt') as f:
            self.instances = f.read().split('\n')
    instance = random.sample(self.instances, k=1)[0]

    # Error Template
    error_msg = (
        "**An error occured!**\n\n"
        f"There was a problem with `{instance}`. Please try again later.\n"
        f"_If problems with this instance persist, "
        f"contact`{self.bot.appinfo.owner}` to have it removed._"
    )

    # Create the URL to make an API call to
    call = f'{instance}search?q={query}&format=json&language=en-US'

    # If a type is provided, add that type to the call URL
    if category:
        call += f'&categories={category}'

    if is_nsfw:
        call += '&safesearch=0'
    else:
        call += '&safesearch=1'

    # Figure out engines for different categories to get decent results.
    if category == 'videos':
        call += '&engines=bing+videos,google+videos'

    # Make said API call
    try:
        async with self.request.get(call) as resp:
            response = await resp.json()
    except aiohttp.ClientError:
        return error_msg

    # Split our response data up for parsing
    # infoboxes = response['infoboxes']
    results = response['results']

    # Create message with results
    try:
        # Handle tiny result count
        if len(results) > 5:
            amt = 5
        else:
            amt = len(results)

        # Remove no-no sites
        if not is_nsfw:
            for r in results[0:7]:
                for n in nono_sites:
                    if n in r['url']:
                        results.remove(r)

        # Escape stuff
        query = discord.utils.escape_mentions(query)
        query = discord.utils.escape_markdown(query)

        # Header
        msg = f"Showing **{amt}** results for `{query}`. \n\n"

        # Expanded Result
        msg += (
            f"**{results[0]['title']}** <{results[0]['url']}>\n"
            f"{results[0]['content']}\n\n")

        # Other Results
        msg += "\n".join(
            [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])

        # Instance Info
        msg += f"\n\n_Results retrieved from instance `{instance}`._"

        return msg

    # Reached if error with returned results
    except (KeyError, IndexError) as e:
        # Logging
        await self.warn(
            f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
            "Consider removing it or looking into it.",
            name="Failed Instance"
        )
        self.instances.remove(instance)  # Weed the instance out

        # Recurse until good response
        return await self._old_search_logic(query, is_nsfw)


async def _instance_check(self, instance: str, content: dict) -> bool:
    """Checks the quality of an instance."""

    # Makes sure proper values exist
    if 'error' in content:
        return False
    if not ('engines' in content and 'initial' in content['timing']):
        return False
    if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
        return False

    # Makes sure google is enabled
    if not content['engines']['google']['enabled']:
        return False

    # Makes sure is not Tor
    if content['network_type'] != 'normal':
        return False

    # Only picks instances that are fast enough
    timing = int(content['timing']['initial'])
    if timing > 0.20:
        return False

    # Check for Google captcha
    test_search = f'{instance}/search?q=test&format=json&lang=en-US'
    try:
        async with self.request.get(test_search) as resp:
            response = await resp.json()
            response['results'][0]['content']
    except (aiohttp.ClientError, KeyError, IndexError):
        return False

    # Reached if passes all checks
    return True


@commands.command()
@commands.is_owner()
async def rejson(self, ctx):
    """Refreshes the list of instances for searx."""

    msg = await ctx.send('<a:updating:403035325242540032> Refreshing instance list...\n\n'
                         '(Due to extensive quality checks, this may take a bit.)')
    plausible: List[str] = []

    # Get, parse, and quality check all instances
    async with self.request.get('https://searx.space/data/instances.json') as r:
        # Parsing
        searx_json = await r.json()
        instances = searx_json['instances']

        # Quality Check
        for i in instances:
            content = instances.get(i)
            is_good: bool = await self._instance_check(i, content)
            if is_good:
                plausible.append(i)

    # Save new list
    self.instances = plausible
    with open('searxes.txt', 'w') as f:
        f.write('\n'.join(plausible))

    await msg.edit(content='Instances refreshed!')


async def _old_basic_search(self, ctx, query: str,
                            category: str = None):
    """Base search message generation."""

    async with ctx.typing():
        is_nsfw = (
            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
            else False
        )

        msg = await self._old_search_logic(query, is_nsfw, category)
        await ctx.send(msg)

        await self.info(
            content=(
                f"**{ctx.author}** searched for `{query}` "
                f"in \"{ctx.guild}\" and got this:"
                f"\n\n{msg}"
            ),
            name="Search Results"
        )