Initial work on pagination

Adriene Hutchins 2020-04-07 17:40:42 -04:00
parent c64e52b865
commit cbea2c6440
4 changed files with 442 additions and 194 deletions

View File

@@ -0,0 +1,176 @@
# -*- coding: utf-8 -*-

# search source
# Provides paginator sources for the search cog.

"""Search Source File"""

from typing import Callable, List, Tuple, Optional, Any

import discord
from discord.ext import menus
import html2text
import re

# Type aliases for the fetcher coroutines handed to the page sources
FetcherArgs = Tuple[Any, ...]
Fetcher = Callable[..., List]

# Markdown converter
tomd = html2text.HTML2Text()
tomd.ignore_links = True
tomd.ignore_images = True
tomd.ignore_tables = True
tomd.ignore_emphasis = True
tomd.body_width = 0
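
# Illustration of the conversion (a sketch; exact spacing depends on the
# html2text version):
#
#     tomd.handle("<p><b>Qwant</b> is a search engine</p>").rstrip('\n')
#     # -> 'Qwant is a search engine'
#
# With the ignore_* flags above the markup is stripped outright, and
# body_width=0 disables html2text's line wrapping.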
# TODO: Change around value names, make it general
class Result:
    """A class that holds the general data for a search result.

    Parameters:
        title (str): Title of the content.
        url (str): The direct link to the content.
        desc (str): The content's description.
        source (Optional[str]): The source site. Defaults to url.
        image (Optional[str]): The content's image.
    """

    def __init__(self, title: str, url: str,
                 desc: str = "No description provided.",
                 source: Optional[str] = None, image: Optional[str] = None):
        self.url = url
        # Falls back to a placeholder when no usable title is given
        if title in (None, ""):
            self.title = "Unknown"
        else:
            self.title = title
        self.desc = desc
        # Defaults to the result URL when no source site is given,
        # as documented above
        self.source = source if source is not None else url
        self.image = image

    def __repr__(self):
        return f'<Result url={self.url} title={self.title} source={self.source}>'
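
# Example construction (hypothetical values):
#
#     r = Result(title="", url="https://example.com")
#     r.title   # 'Unknown' - empty titles fall back to the placeholder
#     r.source  # 'https://example.com' - source defaults to the URL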
class NormalSource(menus.AsyncIteratorPageSource):
    def __init__(self, query: str, fetcher: Fetcher, per_page: int,
                 header: str = "", footer: str = ""):
        self.header = header
        self.footer = footer
        self.query = query
        super().__init__(self._generate(fetcher), per_page=per_page)

    async def _generate(self, fetcher: Fetcher):
        offset = 0
        per_request = 10

        # TODO: put the generation in the fetcher itself
        # Qwant: image - media, source - url, title - title
        while results := await fetcher(
            offset, per_request, self.query
        ):
            for r in results:
                yield r
            offset += per_request

    async def format_page(self, menu, entries):
        start = menu.current_page * self.per_page

        # Escapes all nasties for displaying
        query_display = discord.utils.escape_mentions(self.query)
        query_display = discord.utils.escape_markdown(query_display)

        # Return if no results
        if not entries:
            return f"No results found for `{query_display}`."

        # Gets the first entry's data
        first_title = tomd.handle(entries[0].title).rstrip('\n')
        first_url = entries[0].url
        if start == 0:
            first_desc = tomd.handle(entries[0].desc).rstrip('\n')
            first = f"**{first_title}** {first_url}\n{first_desc}\n\n"
        else:
            first = f"**{first_title}** {first_url}\n"

        # Builds the substring for each of the other results
        other_results: List[str] = []
        for e in entries[1:5]:
            title = tomd.handle(e.title).rstrip('\n')
            other_results.append(f"**{title}** {e.url}")
        other_msg = "\n".join(other_results)

        # Builds the message, wrapping bare URLs in <> to suppress embeds
        msg = f"{first}{other_msg}"
        msg = re.sub(
            r'(https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]+\.'
            r'[a-zA-Z0-9()]+\b[-a-zA-Z0-9()@:%_+.~#?&/=]*)',
            r'<\1>',
            msg
        )

        content = (
            f"{self.header}\n\n"
            f"Showing results *{start} - {start + 5}* "
            f"for `{query_display}`.\n\n"
            f"{msg}\n\n"
            f"{self.footer}"
        )
        return content
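
# Illustration of the URL wrapping above (hypothetical input):
#
#     "**Example** https://example.com/page"
#     # becomes: "**Example** <https://example.com/page>"
#
# Angle brackets suppress Discord's automatic link previews, keeping the
# result list compact.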
class ImageSource(menus.AsyncIteratorPageSource):
    def __init__(self, query: str, fetcher: Fetcher, args: FetcherArgs,
                 header: str = "", footer: str = ""):
        self.header = header
        self.footer = footer
        self.query = query
        super().__init__(self._generate(fetcher, args), per_page=1)

    async def _generate(self, fetcher: Fetcher, fetch_args: FetcherArgs):
        offset = 0
        per_request = 10

        # TODO: put the generation in the fetcher itself
        # Qwant: image - media, source - url, title - title
        while results := await fetcher(
            offset, per_request, self.query, *fetch_args
        ):
            for r in results:
                yield r
            offset += per_request

    async def format_page(self, menu, entry):
        start = menu.current_page * self.per_page
        content = (
            f"{self.header}\n\n"
            f"Showing image result `{start}` for `{self.query}`.\n\n"
            f"<{entry.image}>\n\n"
            f"{self.footer}"
        )
        embed = discord.Embed(
            title=entry.title,
            url=entry.image,
            description=entry.source
        )
        embed.set_image(url=entry.image)
        return {
            "content": content,
            "embed": embed
        }
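
How these sources are driven, as a minimal sketch (the fetcher body and the backend call here are illustrative; the real fetcher is defined in the search cog below):

    async def fetcher(offset: int, per_request: int, query: str) -> List[Result]:
        # One batch per call; returning an empty list ends the async iterator.
        raw = await some_backend(query, count=per_request, offset=offset)  # hypothetical backend
        return [Result(title=r["title"], url=r["url"], desc=r["desc"]) for r in raw]

    pages = menus.MenuPages(
        source=NormalSource("query text", fetcher, 5, footer="_Powered by Qwant._"),
        clear_reactions_after=True,
    )
    await pages.start(ctx)  # ctx is a commands.Context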

View File

@@ -8,12 +8,13 @@
from typing import List
import discord
-from discord.ext import commands
+from discord.ext import commands, menus
import html2text
import re
from urllib.parse import quote_plus

from extensions.models import SearchExceptions
+from extensions.models.search_source import Result, NormalSource, ImageSource


class Search(commands.Cog, name="Basic"):
@@ -39,7 +40,8 @@ class Search(commands.Cog, name="Basic"):
        self.tomd.body_width = 0

    async def _search_logic(self, query: str, is_nsfw: bool = False,
-                            category: str = 'web', count: int = 5) -> list:
+                            category: str = 'web', count: int = 5,
+                            offset: int = 0) -> list:
        """Uses scrapestack and the Qwant API to find search results."""

        # Typing
@@ -87,9 +89,12 @@ class Search(commands.Cog, name="Basic"):
        search_url = (
            f"{base}/search/{category}"
            f"?count={count}"
+            f"&offset={offset}"
            f"&q={query}"
            f"&safesearch={safesearch}"
-            "&t=web"
+            f"&t={category}"
+            "&extensionDisabled=true"
+            "&device=tablet"
            "&locale=en_US"
            "&uiv=4"
        )
@@ -113,11 +118,47 @@ class Search(commands.Cog, name="Basic"):
        }

        async with self.request.get(search_url, headers=headers) as resp:
            to_parse = await resp.json()
+            print(to_parse)

        # Sends results
        return to_parse['data']['result']['items']
+    async def _page_search(self, ctx, query: str, count: int = 5,
+                           category: str = 'web'):
+        """Basic search formatting - this time with pages!"""
+
+        is_nsfw = (
+            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
+            else False
+        )
+
+        # NOTE: q is required by the fetcher signature but unused here;
+        # the closure captures query directly.
+        async def fetcher(offset, per_request, q):
+            result_objects = []
+            results = await self._search_logic(
+                query, is_nsfw, category, per_request, offset
+            )
+            for r in results:
+                result = Result(
+                    title=r["title"],
+                    url=r["url"],
+                    desc=r["desc"],
+                    source=r["source"]
+                )
+                result_objects.append(result)
+            return result_objects
+
+        pages = menus.MenuPages(
+            source=NormalSource(
+                query, fetcher, count,
+                footer="_Powered by Qwant._"
+            ),
+            clear_reactions_after=True,
+        )
+        await pages.start(ctx)
    async def _basic_search(self, ctx, query: str, category: str = 'web'):
        """Basic search formatting."""
@@ -169,8 +210,6 @@ class Search(commands.Cog, name="Basic"):
            f"{other_msg}\n\n_Powered by Qwant._"
        )

-        print(msg)
-
        msg = re.sub(
            r'(https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]+\.'
            r'[a-zA-Z0-9()]+\b[-a-zA-Z0-9()@:%_+.~#?&/=]*)',
@@ -178,7 +217,6 @@ class Search(commands.Cog, name="Basic"):
            msg
        )

        # Sends message
        await self.info(
            f"**New Search** - `{ctx.author}` in `{ctx.guild}`\n\n{msg}",
+    @commands.command()
+    async def paginated_search(self, ctx, *, query: str):
+        async with ctx.typing():
+            await self._page_search(ctx, query)
+
+        # async def fetcher(offset, per_request, q, *args):
+        #     result_objects = []
+        #     results = await self._search_logic(
+        #         q, False, "images", per_request, offset)
+        #     for r in results:
+        #         image = Result(
+        #             title=r["title"],
+        #             url=r["media"],
+        #             source=r["url"],
+        #             image=r["media"]
+        #         )
+        #         result_objects.append(image)
+        #     return result_objects
+
+        # pages = menus.MenuPages(
+        #     source=ImageSource(query, fetcher, (None,)),
+        #     clear_reactions_after=True)
+        # await pages.start(ctx)
    @commands.command()
    async def search(self, ctx, *, query: str):
        """Search online for general results."""

View File

@@ -5,7 +5,8 @@
# Used and modified with permission.
# See LICENSE for license information.

-'''Main File'''
+"""Main File"""

import json
import os
@@ -15,13 +16,14 @@ from typing import List
import aiohttp
import discord
from discord.ext import commands
+from discord.ext.menus import CannotAddReactions
import rethinkdb

from extensions.models import SearchExceptions


class Bot(commands.Bot):
-    """Custom Bot Class that subclasses the commands.ext one"""
+    """Custom Bot Class that subclasses the commands.ext one."""

    def __init__(self, **options):
        """Initializes the main parts of the bot."""
@@ -271,6 +273,9 @@ async def on_command_error(ctx, error):
            "Please try again in an NSFW channel."
        )

+    elif isinstance(error, CannotAddReactions):
+        await ctx.send("**I cannot add reactions for pagination here!**")
+
    # Provides a very pretty embed if something's actually a dev's fault.
    elif isinstance(error, commands.CommandInvokeError):
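
The reaction-driven menus need the Add Reactions permission; menus.MenuPages.start raises CannotAddReactions when it is missing, which the handler above reports. A command could also pre-check before starting a menu (a sketch, not part of this commit):

    perms = ctx.channel.permissions_for(ctx.me)
    if not perms.add_reactions:
        return await ctx.send("**I cannot add reactions for pagination here!**")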

View File

@@ -1,211 +1,214 @@
# This is the old search logic for reference purposes

async def _old_search_logic(self, query: str, is_nsfw: bool = False,
                            category: str = None) -> str:
    """Provides search logic for all search commands."""

    # NSFW Filtering
    # WARNING - This list includes slurs.
    nono_words = [
        'tranny', 'faggot', 'fag',
        'porn', 'cock', 'dick',
        'titty', 'boob', 'penis',
        'slut', 'cum', 'jizz',
        'semen', 'cooch', 'coochie',
        'pussy', 'penis', 'fetish',
        'bdsm', 'sexy', 'xxx',
        'orgasm', 'masturbation',
        'erotic', 'creampie',
        'fap', 'nude', 'orgasm',
        'squirting', 'yiff',
        'e621'
    ]
    nono_sites = [
        'xvideos', 'pornhub',
        'xhamster', 'xnxx',
        'youporn', 'xxx',
        'freexcafe', 'sex.com',
        'e621', 'nhentai'
    ]

    if not is_nsfw:
        for i in nono_words:
            if i in query.replace(" ", ""):
                return (
                    "**Sorry!** That query included language "
                    "we cannot accept in a non-NSFW channel. "
                    "Please try again in an NSFW channel."
                )

    # Choose an instance
    if self.instances == []:
        with open('searxes.txt') as f:
            self.instances = f.read().split('\n')
    instance = random.sample(self.instances, k=1)[0]

    # Error Template
    error_msg = (
        "**An error occurred!**\n\n"
        f"There was a problem with `{instance}`. Please try again later.\n"
        f"_If problems with this instance persist, "
        f"contact `{self.bot.appinfo.owner}` to have it removed._"
    )

    # Create the URL to make an API call to
    call = f'{instance}search?q={query}&format=json&language=en-US'

    # If a type is provided, add that type to the call URL
    if category:
        call += f'&categories={category}'

    if is_nsfw:
        call += '&safesearch=0'
    else:
        call += '&safesearch=1'

    # Figure out engines for different categories to get decent results.
    if category == 'videos':
        call += '&engines=bing+videos,google+videos'

    # Make said API call
    try:
        async with self.request.get(call) as resp:
            response = await resp.json()
    except aiohttp.ClientError:
        return error_msg

    # Split our response data up for parsing
    # infoboxes = response['infoboxes']
    results = response['results']

    # Create message with results
    try:
        # Handle tiny result count
        if len(results) > 5:
            amt = 5
        else:
            amt = len(results)

        # Remove no-no sites
        if not is_nsfw:
            for r in results[0:7]:
                for n in nono_sites:
                    if n in r['url']:
                        results.remove(r)

        # Escape stuff
        query = discord.utils.escape_mentions(query)
        query = discord.utils.escape_markdown(query)

        # Header
        msg = f"Showing **{amt}** results for `{query}`. \n\n"
        # Expanded Result
        msg += (
            f"**{results[0]['title']}** <{results[0]['url']}>\n"
            f"{results[0]['content']}\n\n")
        # Other Results
        msg += "\n".join(
            [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
        # Instance Info
        msg += f"\n\n_Results retrieved from instance `{instance}`._"

        return msg

    # Reached if error with returned results
    except (KeyError, IndexError) as e:
        # Logging
        await self.warn(
            f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
            "Consider removing it or looking into it.",
            name="Failed Instance"
        )

        self.instances.remove(instance)  # Weed the instance out
        # Recurse until good response
        return await self._old_search_logic(query, is_nsfw)


async def _instance_check(self, instance: str, content: dict) -> bool:
    """Checks the quality of an instance."""

    # Makes sure proper values exist
    if 'error' in content:
        return False
    if not ('engines' in content and 'initial' in content['timing']):
        return False
    if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
        return False

    # Makes sure google is enabled
    if not content['engines']['google']['enabled']:
        return False

    # Makes sure is not Tor
    if content['network_type'] != 'normal':
        return False

    # Only picks instances that are fast enough
    timing = int(content['timing']['initial'])
    if timing > 0.20:
        return False

    # Check for Google captcha
    test_search = f'{instance}/search?q=test&format=json&lang=en-US'
    try:
        async with self.request.get(test_search) as resp:
            response = await resp.json()
        response['results'][0]['content']
    except (aiohttp.ClientError, KeyError, IndexError):
        return False

    # Reached if passes all checks
    return True


@commands.command()
@commands.is_owner()
async def rejson(self, ctx):
    """Refreshes the list of instances for searx."""

    msg = await ctx.send('<a:updating:403035325242540032> Refreshing instance list...\n\n'
                         '(Due to extensive quality checks, this may take a bit.)')
    plausible: List[str] = []

    # Get, parse, and quality check all instances
    async with self.request.get('https://searx.space/data/instances.json') as r:
        # Parsing
        searx_json = await r.json()
        instances = searx_json['instances']

    # Quality Check
    for i in instances:
        content = instances.get(i)
        is_good: bool = await self._instance_check(i, content)
        if is_good:
            plausible.append(i)

    # Save new list
    self.instances = plausible
    with open('searxes.txt', 'w') as f:
        f.write('\n'.join(plausible))

    await msg.edit(content='Instances refreshed!')


async def _old_basic_search(self, ctx, query: str,
                            category: str = None):
    """Base search message generation."""

    async with ctx.typing():
        is_nsfw = (
            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
            else False
        )

        msg = await self._old_search_logic(query, is_nsfw, category)
        await ctx.send(msg)

        await self.info(
            content=(
                f"**{ctx.author}** searched for `{query}` "
                f"in \"{ctx.guild}\" and got this:"
                f"\n\n{msg}"
            ),
            name="Search Results"
        )