From cbea2c6440cac836624ee46b7c521d923439d914 Mon Sep 17 00:00:00 2001
From: Adriene Hutchins
Date: Tue, 7 Apr 2020 17:40:42 -0400
Subject: [PATCH] Initial work on pagination

---
 extensions/models/search_source.py | 176 ++++++++++++++
 extensions/search.py               |  78 +++++-
 main.py                            |   9 +-
 old_search.py                      | 373 +++++++++++++++--------
 4 files changed, 442 insertions(+), 194 deletions(-)
 create mode 100644 extensions/models/search_source.py

diff --git a/extensions/models/search_source.py b/extensions/models/search_source.py
new file mode 100644
index 0000000..d2944c8
--- /dev/null
+++ b/extensions/models/search_source.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# search source
+# Provides paginator sources for the search cog.
+
+"""Search Source File"""
+
+from typing import Callable, List, Tuple, Optional, Any
+
+import discord
+from discord.ext import menus
+import html2text
+import re
+
+FetcherArgs = Tuple[Any, ...]
+Fetcher = Callable[..., List]
+
+# Markdown converter
+tomd = html2text.HTML2Text()
+tomd.ignore_links = True
+tomd.ignore_images = True
+tomd.ignore_tables = True
+tomd.ignore_emphasis = True
+tomd.body_width = 0
+
+
+# TODO: Rename the attributes and make this class more general.
+class Result:
+    """Holds the general data for a search result.
+
+    Parameters:
+        title (str): Title of the content.
+        url (str): The direct link to the content.
+        desc (str): The content's description.
+        source (Optional[str]): The source site. Defaults to url.
+        image (Optional[str]): The content's image.
+    """
+
+    def __init__(self, title: str, url: str,
+                 desc: str = "No description provided.",
+                 source: Optional[str] = None, image: Optional[str] = None):
+        self.url = url
+        self.title = title if title else "Unknown"
+        self.desc = desc
+        self.source = source or url
+        self.image = image
+
+    def __repr__(self):
+        return f'<Result title={self.title!r} url={self.url!r}>'
+
+
+class NormalSource(menus.AsyncIteratorPageSource):
+    def __init__(self, query: str, fetcher: Fetcher, per_page: int,
+                 header: str = "", footer: str = ""):
+        self.header = header
+        self.footer = footer
+        self.query = query
+
+        super().__init__(self._generate(fetcher), per_page=per_page)
+
+    async def _generate(self, fetcher: Fetcher):
+        offset = 0
+        per_request = 10
+        # TODO: Move result generation into the fetcher itself.
+        # Qwant: image - media, source - url, title - title
+        while results := await fetcher(offset, per_request, self.query):
+            for r in results:
+                yield r
+            offset += per_request
+
+    async def format_page(self, menu, entries):
+        start = menu.current_page * self.per_page
+
+        # Escape mentions and markdown so the query displays safely
+        query_display = discord.utils.escape_mentions(self.query)
+        query_display = discord.utils.escape_markdown(query_display)
+
+        # Return early if there are no results
+        if not entries:
+            return f"No results found for `{query_display}`."
+
+        # Expand the first entry with its description on the first page
+        first_title = tomd.handle(entries[0].title).rstrip('\n')
+        first_url = entries[0].url
+        if start == 0:
+            first_desc = tomd.handle(entries[0].desc).rstrip('\n')
+            first = f"**{first_title}** {first_url}\n{first_desc}\n\n"
+        else:
+            first = f"**{first_title}** {first_url}\n"
+
+        # Builds the substring for each of the other results
+        other_results: List[str] = []
+
+        for e in entries[1:]:
+            title = tomd.handle(e.title).rstrip('\n')
+            url = e.url
+            other_results.append(f"**{title}** {url}")
+
+        other_msg = "\n".join(other_results)
+
+        # Builds the message, wrapping bare URLs in <> to suppress embeds
+        msg = f"{first}{other_msg}"
+        msg = re.sub(
+            r'(https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]+\.'
+            r'[a-zA-Z0-9()]+\b[-a-zA-Z0-9()@:%_+.~#?&/=]*)',
+            r'<\1>',
+            msg
+        )
+
+        content = (
+            f"{self.header}\n\n"
+            f"Showing results *{start + 1} - {start + len(entries)}* "
+            f"for `{query_display}`.\n\n"
+            f"{msg}\n\n"
+            f"{self.footer}"
+        )
+
+        return content
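+
+
+# A rough usage sketch for NormalSource (the fetcher below is hypothetical,
+# shown only for illustration; the real wiring is _page_search in
+# extensions/search.py). The fetcher signature NormalSource expects is
+# (offset, per_request, query) -> List[Result]:
+#
+#     async def fetcher(offset, per_request, q):
+#         rows = await some_search_call(q, count=per_request, offset=offset)
+#         return [Result(title=r["title"], url=r["url"], desc=r["desc"])
+#                 for r in rows]
+#
+#     pages = menus.MenuPages(
+#         source=NormalSource(q, fetcher, per_page=5),
+#         clear_reactions_after=True,
+#     )
+#     await pages.start(ctx)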
+
+
+class ImageSource(menus.AsyncIteratorPageSource):
+    def __init__(self, query: str, fetcher: Fetcher, args: FetcherArgs,
+                 header: str = "", footer: str = ""):
+        self.header = header
+        self.footer = footer
+        self.query = query
+        super().__init__(self._generate(fetcher, args), per_page=1)
+
+    async def _generate(self, fetcher: Fetcher, fetch_args: FetcherArgs):
+        offset = 0
+        per_request = 10
+        # TODO: Move result generation into the fetcher itself.
+        # Qwant: image - media, source - url, title - title
+        while results := await fetcher(
+            offset, per_request, self.query, *fetch_args
+        ):
+            for r in results:
+                yield r
+            offset += per_request
+
+    async def format_page(self, menu, entry):
+        start = menu.current_page * self.per_page
+
+        content = (
+            f"{self.header}\n\n"
+            f"Showing image result `{start + 1}` for `{self.query}`.\n\n"
+            f"<{entry.image}>\n\n"
+            f"{self.footer}"
+        )
+
+        embed = discord.Embed(
+            title=entry.title,
+            url=entry.image,
+            description=entry.source
+        )
+        embed.set_image(url=entry.image)
+
+        return {
+            "content": content,
+            "embed": embed
+        }

diff --git a/extensions/search.py b/extensions/search.py
index 5384413..3ba3e36 100644
--- a/extensions/search.py
+++ b/extensions/search.py
@@ -8,12 +8,13 @@
 
 from typing import List
 
 import discord
-from discord.ext import commands
+from discord.ext import commands, menus
 import html2text
 import re
 from urllib.parse import quote_plus
 
 from extensions.models import SearchExceptions
+from extensions.models.search_source import Result, NormalSource, ImageSource
 
 
 class Search(commands.Cog, name="Basic"):
@@ -39,7 +40,8 @@ class Search(commands.Cog, name="Basic"):
         self.tomd.body_width = 0
 
     async def _search_logic(self, query: str, is_nsfw: bool = False,
-                            category: str = 'web', count: int = 5) -> list:
+                            category: str = 'web', count: int = 5,
+                            offset: int = 0) -> list:
         """Uses scrapestack and the Qwant API to find search results."""
 
         # Typing
@@ -87,9 +89,12 @@ class Search(commands.Cog, name="Basic"):
         search_url = (
             f"{base}/search/{category}"
             f"?count={count}"
+            f"&offset={offset}"
             f"&q={query}"
             f"&safesearch={safesearch}"
-            "&t=web"
+            f"&t={category}"
+            "&extensionDisabled=true"
+            "&device=tablet"
             "&locale=en_US"
             "&uiv=4"
         )
@@ -113,11 +118,47 @@ class Search(commands.Cog, name="Basic"):
         }
         async with self.request.get(search_url, headers=headers) as resp:
             to_parse = await resp.json()
-            print(to_parse)
 
             # Sends results
             return to_parse['data']['result']['items']
 
+    async def _page_search(self, ctx, query: str, count: int = 5,
+                           category: str = 'web'):
+        """Basic search formatting - this time with pages!"""
+
+        is_nsfw = (
+            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
+            else False
+        )
+
+        async def fetcher(offset, per_request, q):
+            result_objects = []
+            results = await self._search_logic(
+                q, is_nsfw, category, per_request, offset
+            )
+
+            for r in results:
+                result = Result(
+                    title=r["title"],
+                    url=r["url"],
+                    desc=r["desc"],
+                    source=r["source"]
+                )
+                result_objects.append(result)
+
+            return result_objects
+
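+        # MenuPages drives the reaction-based paginator; NormalSource's async
+        # iterator calls fetcher() with a growing offset, so further Qwant
+        # requests happen only when the user pages far enough to need them.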
+        pages = menus.MenuPages(
+            source=NormalSource(
+                query, fetcher, count,
+                footer="_Powered by Qwant._"
+            ),
+            clear_reactions_after=True,
+        )
+        await pages.start(ctx)
+
+
     async def _basic_search(self, ctx, query: str, category: str = 'web'):
         """Basic search formatting."""
 
@@ -169,8 +210,6 @@ class Search(commands.Cog, name="Basic"):
             f"{other_msg}\n\n_Powered by Qwant._"
         )
 
-        print(msg)
-
         msg = re.sub(
             r'(https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]+\.'
             r'[a-zA-Z0-9()]+\b[-a-zA-Z0-9()@:%_+.~#?&/=]*)',
@@ -178,7 +217,6 @@ class Search(commands.Cog, name="Basic"):
             msg
         )
 
-
         # Sends message
         await self.info(
             f"**New Search** - `{ctx.author}` in `{ctx.guild}`\n\n{msg}",
@@ -186,6 +224,32 @@ class Search(commands.Cog, name="Basic"):
         )
         await ctx.send(msg)
 
+    @commands.command()
+    async def paginated_search(self, ctx, *, query: str):
+        """Searches online with paginated results."""
+
+        async with ctx.typing():
+            await self._page_search(ctx, query)
+
+        # Draft ImageSource wiring, kept commented out for future work:
+        # async def fetcher(offset, per_request, q, *args):
+        #     result_objects = []
+        #     results = await self._search_logic(
+        #         q, False, "images", per_request, offset)
+        #     for r in results:
+        #         image = Result(
+        #             title=r["title"],
+        #             url=r["media"],
+        #             source=r["url"],
+        #             image=r["media"]
+        #         )
+        #         result_objects.append(image)
+        #     return result_objects
+
+        # pages = menus.MenuPages(
+        #     source=ImageSource(query, fetcher, (None,)),
+        #     clear_reactions_after=True)
+        # await pages.start(ctx)
+
     @commands.command()
     async def search(self, ctx, *, query: str):
         """Search online for general results."""

diff --git a/main.py b/main.py
index 080e91f..7096d62 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,8 @@
 # Used and modified with permission.
 # See LICENSE for license information.
 
-'''Main File'''
+
+"""Main File"""
 
 import json
 import os
@@ -15,13 +16,14 @@ from typing import List
 
 import aiohttp
 import discord
 from discord.ext import commands
+from discord.ext.menus import CannotAddReactions
 import rethinkdb
 
 from extensions.models import SearchExceptions
 
 
 class Bot(commands.Bot):
-    """Custom Bot Class that subclasses the commands.ext one"""
+    """Custom Bot Class that subclasses the commands.ext one."""
 
     def __init__(self, **options):
         """Initializes the main parts of the bot."""
@@ -271,6 +273,9 @@ async def on_command_error(ctx, error):
             "Please try again in an NSFW channel."
         )
 
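+    # discord-ext-menus raises CannotAddReactions when it cannot attach the
+    # paginator's reaction controls (e.g. missing Add Reactions permission).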
+    elif isinstance(error, CannotAddReactions):
+        await ctx.send("**I cannot add reactions for pagination here!**")
+
     # Provides a very pretty embed if something's actually a dev's fault.
     elif isinstance(error, commands.CommandInvokeError):
 
diff --git a/old_search.py b/old_search.py
index 4f3ee95..02fa5c0 100644
--- a/old_search.py
+++ b/old_search.py
@@ -1,211 +1,214 @@
-# This is the old search logic for reference purposes
-
-    async def _old_search_logic(self, query: str, is_nsfw: bool = False,
-                                category: str = None) -> str:
-        """Provides search logic for all search commands."""
+# This is the old search logic for reference purposes
 
-        # NSFW Filtering
-        # WARNING - This list includes slurs.
-        nono_words = [
-            'tranny', 'faggot', 'fag',
-            'porn', 'cock', 'dick',
-            'titty', 'boob', 'penis',
-            'slut', 'cum', 'jizz',
-            'semen', 'cooch', 'coochie',
-            'pussy', 'penis', 'fetish',
-            'bdsm', 'sexy', 'xxx',
-            'orgasm', 'masturbation',
-            'erotic', 'creampie',
-            'fap', 'nude', 'orgasm',
-            'squirting', 'yiff',
-            'e621'
-        ]
-        nono_sites = [
-            'xvideos', 'pornhub',
-            'xhamster', 'xnxx',
-            'youporn', 'xxx',
-            'freexcafe', 'sex.com',
-            'e621', 'nhentai'
-        ]
+async def _old_search_logic(self, query: str, is_nsfw: bool = False,
+                            category: str = None) -> str:
+    """Provides search logic for all search commands."""
+
+    # NSFW Filtering
+    # WARNING - This list includes slurs.
+    nono_words = [
+        'tranny', 'faggot', 'fag',
+        'porn', 'cock', 'dick',
+        'titty', 'boob', 'penis',
+        'slut', 'cum', 'jizz',
+        'semen', 'cooch', 'coochie',
+        'pussy', 'penis', 'fetish',
+        'bdsm', 'sexy', 'xxx',
+        'orgasm', 'masturbation',
+        'erotic', 'creampie',
+        'fap', 'nude', 'orgasm',
+        'squirting', 'yiff',
+        'e621'
+    ]
+    nono_sites = [
+        'xvideos', 'pornhub',
+        'xhamster', 'xnxx',
+        'youporn', 'xxx',
+        'freexcafe', 'sex.com',
+        'e621', 'nhentai'
+    ]
+
+    if not is_nsfw:
+        for i in nono_words:
+            if i in query.replace(" ", ""):
+                return (
+                    "**Sorry!** That query included language "
+                    "we cannot accept in a non-NSFW channel. "
+                    "Please try again in an NSFW channel."
+                )
+
+    # Choose an instance
+    if self.instances == []:
+        with open('searxes.txt') as f:
+            self.instances = f.read().split('\n')
+    instance = random.sample(self.instances, k=1)[0]
+
+    # Error Template
+    error_msg = (
+        "**An error occurred!**\n\n"
+        f"There was a problem with `{instance}`. Please try again later.\n"
+        f"_If problems with this instance persist, "
+        f"contact `{self.bot.appinfo.owner}` to have it removed._"
+    )
+
+    # Create the URL to make an API call to
+    call = f'{instance}search?q={query}&format=json&language=en-US'
+
+    # If a type is provided, add that type to the call URL
+    if category:
+        call += f'&categories={category}'
+
+    if is_nsfw:
+        call += '&safesearch=0'
+    else:
+        call += '&safesearch=1'
+
+    # Figure out engines for different categories to get decent results.
+    if category == 'videos':
+        call += '&engines=bing+videos,google+videos'
+    # Make said API call
+    try:
+        async with self.request.get(call) as resp:
+            response = await resp.json()
+    except aiohttp.ClientError:
+        return error_msg
+
+    # Split our response data up for parsing
+    # infoboxes = response['infoboxes']
+    results = response['results']
+
+    # Create message with results
+    try:
+        # Handle tiny result count
+        if len(results) > 5:
+            amt = 5
+        else:
+            amt = len(results)
+
+        # Remove no-no sites
         if not is_nsfw:
-            for i in nono_words:
-                if i in query.replace(" ", ""):
-                    return (
-                        "**Sorry!** That query included language "
-                        "we cannot accept in a non-NSFW channel. "
-                        "Please try again in an NSFW channel."
-                    )
+            for r in results[0:7]:
+                for n in nono_sites:
+                    if n in r['url']:
+                        results.remove(r)
 
-        # Choose an instance
-        if self.instances == []:
-            with open('searxes.txt') as f:
-                self.instances = f.read().split('\n')
-        instance = random.sample(self.instances, k=1)[0]
+        # Escape stuff
+        query = discord.utils.escape_mentions(query)
+        query = discord.utils.escape_markdown(query)
 
-        # Error Template
-        error_msg = (
-            "**An error occured!**\n\n"
-            f"There was a problem with `{instance}`. Please try again later.\n"
-            f"_If problems with this instance persist, "
-            f"contact`{self.bot.appinfo.owner}` to have it removed._"
+        # Header
+        msg = f"Showing **{amt}** results for `{query}`.\n\n"
+        # Expanded Result
+        msg += (
+            f"**{results[0]['title']}** <{results[0]['url']}>\n"
+            f"{results[0]['content']}\n\n")
+        # Other Results
+        msg += "\n".join(
+            [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
+        # Instance Info
+        msg += f"\n\n_Results retrieved from instance `{instance}`._"
+
+        return msg
+
+    # Reached if error with returned results
+    except (KeyError, IndexError) as e:
+        # Logging
+        await self.warn(
+            f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
+            "Consider removing it or looking into it.",
+            name="Failed Instance"
+        )
 
-        # Create the URL to make an API call to
-        call = f'{instance}search?q={query}&format=json&language=en-US'
+        self.instances.remove(instance)  # Weed the instance out
+        # Recurse until good response
+        return await self._old_search_logic(query, is_nsfw)
 
-        # If a type is provided, add that type to the call URL
-        if category:
-            call += f'&categories={category}'
 
-        if is_nsfw:
-            call += '&safesearch=0'
-        else:
-            call += '&safesearch=1'
+async def _instance_check(self, instance: str, content: dict) -> bool:
+    """Checks the quality of an instance."""
 
-        # Figure out engines for different categories to get decent results.
-        if category == 'videos':
-            call += '&engines=bing+videos,google+videos'
-        # Make said API call
-        try:
-            async with self.request.get(call) as resp:
-                response = await resp.json()
-        except aiohttp.ClientError:
-            return error_msg
+    # Makes sure proper values exist
+    if 'error' in content:
+        return False
+    if not ('engines' in content and 'initial' in content['timing']):
+        return False
+    if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
+        return False
 
-        # Split our response data up for parsing
-        # infoboxes = response['infoboxes']
-        results = response['results']
+    # Makes sure google is enabled
+    if not content['engines']['google']['enabled']:
+        return False
 
-        # Create message with results
-        try:
-            # Handle tiny result count
-            if len(results) > 5:
-                amt = 5
-            else:
-                amt = len(results)
+    # Makes sure is not Tor
+    if content['network_type'] != 'normal':
+        return False
 
-            # Remove no-no sites
-            if not is_nsfw:
-                for r in results[0:7]:
-                    for n in nono_sites:
-                        if n in r['url']:
-                            results.remove(r)
+    # Only picks instances that are fast enough (under 0.20 seconds)
+    timing = float(content['timing']['initial'])
+    if timing > 0.20:
+        return False
 
-            # Escape stuff
-            query = discord.utils.escape_mentions(query)
-            query = discord.utils.escape_markdown(query)
+    # Check for Google captcha
+    test_search = f'{instance}/search?q=test&format=json&lang=en-US'
+    try:
+        async with self.request.get(test_search) as resp:
+            response = await resp.json()
+        response['results'][0]['content']
+    except (aiohttp.ClientError, KeyError, IndexError):
+        return False
 
-            # Header
-            msg = f"Showing **{amt}** results for `{query}`. \n\n"
-            # Expanded Result
-            msg += (
-                f"**{results[0]['title']}** <{results[0]['url']}>\n"
-                f"{results[0]['content']}\n\n")
-            # Other Results
-            msg += "\n".join(
-                [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
-            # Instance Info
-            msg += f"\n\n_Results retrieved from instance `{instance}`._"
+    # Reached if passes all checks
+    return True
 
-            return msg
-
-        # Reached if error with returned results
-        except (KeyError, IndexError) as e:
-            # Logging
-            await self.warn(
-                f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
-                "Consider removing it or looking into it.",
-                name="Failed Instance"
-            )
+@commands.command()
+@commands.is_owner()
+async def rejson(self, ctx):
+    """Refreshes the list of instances for searx."""
 
-            self.instances.remove(instance)  # Weed the instance out
-            # Recurse until good response
-            return await self._old_search_logic(query, is_nsfw)
+    msg = await ctx.send(' Refreshing instance list...\n\n'
+                         '(Due to extensive quality checks, this may take a bit.)')
+    plausible: List[str] = []
 
-    async def _instance_check(self, instance: str, content: dict) -> bool:
-        """Checks the quality of an instance."""
+    # Get, parse, and quality check all instances
+    async with self.request.get('https://searx.space/data/instances.json') as r:
+        # Parsing
+        searx_json = await r.json()
+        instances = searx_json['instances']
 
-        # Makes sure proper values exist
-        if 'error' in content:
-            return False
-        if not ('engines' in content and 'initial' in content['timing']):
-            return False
-        if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
-            return False
+        # Quality Check
+        for i in instances:
+            content = instances.get(i)
+            is_good: bool = await self._instance_check(i, content)
+            if is_good:
+                plausible.append(i)
 
-        # Makes sure google is enabled
-        if not content['engines']['google']['enabled']:
-            return False
+    # Save new list
+    self.instances = plausible
+    with open('searxes.txt', 'w') as f:
+        f.write('\n'.join(plausible))
 
-        # Makes sure is not Tor
-        if content['network_type'] != 'normal':
-            return False
+    await msg.edit(content='Instances refreshed!')
 
-        # Only picks instances that are fast enough
-        timing = int(content['timing']['initial'])
-        if timing > 0.20:
-            return False
+
+async def _old_basic_search(self, ctx, query: str,
+                            category: str = None):
+    """Base search message generation."""
 
-        # Check for Google captcha
-        test_search = f'{instance}/search?q=test&format=json&lang=en-US'
-        try:
-            async with self.request.get(test_search) as resp:
-                response = await resp.json()
-            response['results'][0]['content']
-        except (aiohttp.ClientError, KeyError, IndexError):
-            return False
+    async with ctx.typing():
+        is_nsfw = (
+            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
+            else False
+        )
 
-        # Reached if passes all checks
-        return True
+        msg = await self._old_search_logic(query, is_nsfw, category)
+        await ctx.send(msg)
 
-    @commands.command()
-    @commands.is_owner()
-    async def rejson(self, ctx):
-        """Refreshes the list of instances for searx."""
-
-        msg = await ctx.send(' Refreshing instance list...\n\n'
-                             '(Due to extensive quality checks, this may take a bit.)')
-        plausible: List[str] = []
-
-        # Get, parse, and quality check all instances
-        async with self.request.get('https://searx.space/data/instances.json') as r:
-            # Parsing
-            searx_json = await r.json()
-            instances = searx_json['instances']
-
-            # Quality Check
-            for i in instances:
-                content = instances.get(i)
-                is_good: bool = await self._instance_check(i, content)
-                if is_good:
-                    plausible.append(i)
-
-        # Save new list
-        self.instances = plausible
-        with open('searxes.txt', 'w') as f:
-            f.write('\n'.join(plausible))
-
-        await msg.edit(content='Instances refreshed!')
-
-    async def _old_basic_search(self, ctx, query: str,
-                                category: str = None):
-        """Base search message generation."""
-
-        async with ctx.typing():
-            is_nsfw = (
-                ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
-                else False
-            )
-
-            msg = await self._old_search_logic(query, is_nsfw, category)
-            await ctx.send(msg)
-
-        await self.info(
-            content=(
-                f"**{ctx.author}** searched for `{query}` "
-                f"in \"{ctx.guild}\" and got this:"
-                f"\n\n{msg}"
-            ),
-            name="Search Results"
-        )
\ No newline at end of file
+    await self.info(
+        content=(
+            f"**{ctx.author}** searched for `{query}` "
+            f"in \"{ctx.guild}\" and got this:"
+            f"\n\n{msg}"
+        ),
+        name="Search Results"
+    )