From 391beaf4223ab026f0eeb702ca757023debe7e63 Mon Sep 17 00:00:00 2001
From: Adriene Hutchins
Date: Tue, 24 Mar 2020 19:09:01 -0400
Subject: [PATCH] Moving search to Qwant

---
 extensions/core.py                    |   8 +
 extensions/models/SearchExceptions.py |  17 ++
 extensions/search.py                  | 352 +++++++++++---------
 extensions/utils/logging.py           |  38 +--
 main.py                               |  25 +-
 old_search.py                         | 211 +++++++++++++++
 6 files changed, 422 insertions(+), 229 deletions(-)
 create mode 100644 extensions/models/SearchExceptions.py
 create mode 100644 old_search.py

diff --git a/extensions/core.py b/extensions/core.py
index 2571f90..467739c 100644
--- a/extensions/core.py
+++ b/extensions/core.py
@@ -267,6 +267,14 @@ Guild count: {len(self.bot.guilds)}
         # Message Sending
         await ctx.send(msg)
 
+    @commands.command()
+    @commands.is_owner()
+    async def toggle_debug(self, ctx):
+        """Toggles debug while running."""
+
+        self.bot.debug_toggle = not self.bot.debug_toggle
+        await ctx.send(f"Set debug mode to `{self.bot.debug_toggle}`.")
+
     @commands.command(aliases=['exit', 'reboot'])
     @commands.is_owner()
     async def restart(self, ctx):
diff --git a/extensions/models/SearchExceptions.py b/extensions/models/SearchExceptions.py
new file mode 100644
index 0000000..1f93054
--- /dev/null
+++ b/extensions/models/SearchExceptions.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# search exceptions
+# Provides custom exceptions for the search cog.
+
+"""Search Exceptions File"""
+
+from discord.ext import commands
+
+
+class SafesearchFail(commands.CommandError):
+    """Thrown when a query contains NSFW content."""
+    pass
+
+
+def setup(bot):
+    pass
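For reference (not part of the patch): SafesearchFail subclasses commands.CommandError, which matters because discord.py 1.x re-raises CommandError subclasses from a command callback as-is instead of wrapping them in CommandInvokeError, so they reach on_command_error handlers intact. A minimal, hypothetical sketch of that flow; the Example cog and lookup command below are illustrative only and not from this repository.

from discord.ext import commands

from extensions.models import SearchExceptions


class Example(commands.Cog):
    """Illustrative cog showing how SafesearchFail surfaces to handlers."""

    def __init__(self, bot):
        self.bot = bot

    @commands.command()
    async def lookup(self, ctx, *, query: str):
        # Raising a commands.CommandError subclass inside a command
        # callback sends it to on_command_error unwrapped.
        if "unsafe" in query:
            raise SearchExceptions.SafesearchFail('Query had NSFW.')
        await ctx.send(f"Searching for `{query}`...")

    @commands.Cog.listener()
    async def on_command_error(self, ctx, error):
        if isinstance(error, SearchExceptions.SafesearchFail):
            await ctx.send("**Sorry!** That query can't be used here.")


def setup(bot):
    bot.add_cog(Example(bot))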
diff --git a/extensions/search.py b/extensions/search.py
index 781bd71..132af17 100644
--- a/extensions/search.py
+++ b/extensions/search.py
@@ -10,6 +10,8 @@ from discord.ext import commands
 import aiohttp
 import random
 from typing import List
+from extensions.models import SearchExceptions
+import html2text
 
 
 class Search(commands.Cog, name="Basic"):
@@ -21,193 +23,153 @@ class Search(commands.Cog, name="Basic"):
         self.bot = bot
         self.info = bot.logging.info
         self.warn = bot.logging.warn
+        self.debug = bot.logging.debug
         self.request = bot.request
         self.emoji = "\U0001F50D"
+        self.scrape_token = bot.config['SCRAPESTACK']
 
-        # Get Instances
-        with open('searxes.txt') as f:
-            self.instances = f.read().split('\n')
+        # Markdown converter
+        self.tomd = html2text.HTML2Text()
+        self.tomd.ignore_links = True
+        self.tomd.ignore_images = True
+        self.tomd.ignore_tables = True
+        self.tomd.ignore_emphasis = True
+        self.tomd.body_width = 0
 
     async def _search_logic(self, query: str, is_nsfw: bool = False,
-                            category: str = None) -> str:
-        """Provides search logic for all search commands."""
+                            category: str = 'web', count: int = 5) -> list:
+        """Uses scrapestack and the Qwant API to find search results."""
+
+        # Typing
+        base: str
+        safesearch: str
 
         # NSFW Filtering
-        # WARNING - This list includes slurs.
+        # WARNING This list includes slurs.
         nono_words = [
-            'tranny', 'faggot', 'fag',
-            'porn', 'cock', 'dick',
-            'titty', 'boob', 'penis',
-            'slut', 'cum', 'jizz',
-            'semen', 'cooch', 'coochie',
-            'pussy', 'penis', 'fetish',
-            'bdsm', 'sexy', 'xxx',
-            'orgasm', 'masturbation',
-            'erotic', 'creampie',
-            'fap', 'nude', 'orgasm',
-            'squirting', 'yiff',
-            'e621'
-        ]
-        nono_sites = [
-            'xvideos', 'pornhub',
-            'xhamster', 'xnxx',
-            'youporn', 'xxx',
-            'freexcafe', 'sex.com',
-            'e621'
+            'tranny', 'faggot', 'fag', 'porn', 'cock', 'dick',
+            'titty', ' tit ', 'boob', 'penis', 'slut', ' cum ', 'jizz',
+            'semen', 'cooch', 'coochie', 'pussy', 'penis', 'fetish',
+            'bdsm', 'sexy', 'xxx', 'orgasm', 'masturbat',
+            'erotic', 'creampie', 'fap', 'nude', 'orgasm',
+            'squirting', 'yiff', 'e621', ' sex', 'ejaculat',
+            'cunt', 'vagina', 'coom', 'troon', 'hentai', 'yaoi',
+            'bukkake', 'bara', 'shota', 'loli', 'fetish', 'spunk',
+            'pron', 'p0rn', 'pr0n', 'gloryhole', 'felch', 'skullfuck',
+            'scat', 'pissplay', 'piss play', 'underage', 'bbw',
+            'fisting', 'queef', "rimming", 'rimjob', 'bdsm',
+            'cbt', 'blumpkin', 'boner', 'prostitut', 'butt plug',
+            'transvestite', 'femboy', 'castrat', 'philia', 'edging',
+            'edgeplay', 'enema', 'facial', 'fellat', 'femdom', 'footjob',
+            'blowjob', 'titjob', 'handjob', 'frot', 'gang bang', 'gangbang',
+            'glory hole', 'hermap', 'jerk off', 'jerking off', 'jack off',
+            'jacking off', 'kink', 'wet dream', 'anal', 'pegging', 'precum',
+            'pre-cum', 'pre cum', 'priap', 'scrotum', 'shemale', 'smegma',
+            'smut', 'softcore', 'transsexual', 'voyeur', 'viagra', 'wank',
+            'whore'
         ]
 
-        if not is_nsfw:
-            for i in nono_words:
-                if i in query.replace(" ", ""):
-                    return (
-                        "**Sorry!** That query included language "
-                        "we cannot accept in a non-NSFW channel. "
-                        "Please try again in an NSFW channel."
-                    )
+        if not is_nsfw and any(n in query for n in nono_words):
+            raise SearchExceptions.SafesearchFail('Query had NSFW.')
 
-        # Choose an instance
-        if self.instances == []:
-            with open('searxes.txt') as f:
-                self.instances = f.read().split('\n')
-        instance = random.sample(self.instances, k=1)[0]
+        # Scrape or not
+        # if self.scrape_token != '':
+        #     base = (
+        #         "http://api.scrapestack.com/scrape"
+        #         f"?access_key={self.scrape_token}"
+        #         f"&url=https://api.qwant.com/api"
+        #     )
+        #     print(base)
+        # else:
+        base = "https://api.qwant.com/api"
 
-        # Error Template
-        error_msg = (
-            "**An error occured!**\n\n"
-            f"There was a problem with `{instance}`. Please try again later.\n"
-            f"_If problems with this instance persist, contact`{self.bot.appinfo.owner}` to have it removed._"
+        # Safesearch
+        if is_nsfw:
+            safesearch = "0"
+        else:
+            safesearch = "2"
+
+        # Search URL Building
+        # api.qwant.com/api/search/web?count=5&q=test&safesearch=2&...
+        search_url = (
+            f"{base}/search/{category}"
+            f"?count={count}"
+            f"&q={query}"
+            f"&safesearch={safesearch}"
+            "&t=web"
+            "&locale=en_US"
+            "&uiv=4"
+        )
+        await self.debug(search_url, name="_search_logic")
+
+        # Searching
+        headers = {
+            'User-Agent': (
+                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)'
+                ' Gecko/20100101 Firefox/74.0'
+            )
+        }
+        async with self.request.get(search_url, headers=headers) as resp:
+            to_parse = await resp.json()
+
+        # Sends results
+        return to_parse['data']['result']['items']
+
+    async def _basic_search(self, ctx, query: str, category: str = 'web'):
+        """Basic search formatting."""
+
+        # NOTE Customizable count not yet implemented.
+        count: int = 5
+
+        # Safesearch variable
+        is_nsfw = (
+            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
+            else False
         )
 
-        # Create the URL to make an API call to
-        call = f'{instance}search?q={query}&format=json&language=en-US'
-
-        # If a type is provided, add that type to the call URL
-        if category:
-            call += f'&categories={category}'
-
-        if is_nsfw:
-            call += '&safesearch=0'
-        else:
-            call += '&safesearch=1'
-
-        # Figure out engines for different categories to get decent results.
-        if category == 'videos':
-            call += '&engines=bing+videos,google+videos'
-        # Make said API call
-        try:
-            async with self.request.get(call) as resp:
-                response = await resp.json()
-        except aiohttp.ClientError:
-            return error_msg
-
-        # Split our response data up for parsing
-        # infoboxes = response['infoboxes']
-        results = response['results']
-
-        # Create message with results
-        try:
-            # Handle tiny result count
-            if len(results) > 5:
-                amt = 5
-            else:
-                amt = len(results)
-
-            # Remove no-no sites
-            if not is_nsfw:
-                for r in results[0:7]:
-                    for n in nono_sites:
-                        if n in r['url']:
-                            results.remove(r)
-
-            # Escape stuff
-            query = discord.utils.escape_mentions(query)
-            query = discord.utils.escape_markdown(query)
-
-            # Header
-            msg = f"Showing **{amt}** results for `{query}`. \n\n"
-            # Expanded Result
-            msg += (
-                f"**{results[0]['title']}** <{results[0]['url']}>\n"
-                f"{results[0]['content']}\n\n")
-            # Other Results
-            msg += "\n".join(
-                [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
-            # Instance Info
-            msg += f"\n\n_Results retrieved from instance `{instance}`._"
-
-            return msg
-
-        # Reached if error with returned results
-        except (KeyError, IndexError) as e:
-            # Logging
-            await self.warn(
-                f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
" - "Consider removing it or looking into it.", - name="Failed Instance" - ) - - self.instances.remove(instance) # Weed the instance out - # Recurse until good response - return await self._search_logic(query, is_nsfw) - - async def _instance_check(self, instance: str, content: dict) -> bool: - """Checks the quality of an instance.""" - - # Makes sure proper values exist - if 'error' in content: - return False - if not ('engines' in content and 'initial' in content['timing']): - return False - if not ('google' in content['engines'] and 'enabled' in content['engines']['google']): - return False - - # Makes sure google is enabled - if not content['engines']['google']['enabled']: - return False - - # Makes sure is not Tor - if content['network_type'] != 'normal': - return False - - # Only picks instances that are fast enough - timing = int(content['timing']['initial']) - if timing > 0.20: - return False - - # Check for Google captcha - test_search = f'{instance}/search?q=test&format=json&lang=en-US' - try: - async with self.request.get(test_search) as resp: - response = await resp.json() - response['results'][0]['content'] - except (aiohttp.ClientError, KeyError, IndexError): - return False - - # Reached if passes all checks - return True - - async def _basic_search(self, ctx, query: str, - category: str = None): - """Base search message generation.""" - + # Handling async with ctx.typing(): - is_nsfw = ( - ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw') - else False + + # Searches + results = await self._search_logic(query, is_nsfw, category) + count = len(results) + + # Escapes all nasties for displaying + query_display = discord.utils.escape_mentions(query) + query_display = discord.utils.escape_markdown(query_display) + + # Return if no results + try: + results[0] + except IndexError: + return await ctx.send( + f"No results found for `{query_display}`." + ) + + # Gets the first entry's stuff + first_title = self.tomd.handle(results[0]['title']).rstrip('\n') + first_url = results[0]['url'] + first_desc = self.tomd.handle(results[0]['desc']).rstrip('\n') + + # Builds the substring for each of the other results. 
+            other_results: List[str] = []
+            for r in results[1:count]:
+                title = self.tomd.handle(r['title']).rstrip('\n')
+                url = r['url']
+                other_results.append(f"**{title}** <{url}>")
+            other_msg: str = "\n".join(other_results)
+
+            # Builds message
+            msg = (
+                f"Showing **{count}** results for `{query_display}`.\n\n"
+                f"**{first_title}** <{first_url}>\n{first_desc}\n\n"
+                f"{other_msg}\n\n_Powered by Qwant._"
             )
-            msg = await self._search_logic(query, is_nsfw, category)
+            # Sends message
+            await self.debug(msg, name="_basic_search")
             await ctx.send(msg)
 
-            await self.info(
-                content=(
-                    f"**{ctx.author}** searched for `{query}` "
-                    f"in \"{ctx.guild}\" and got this:"
-                    f"\n\n{msg}"
-                ),
-                name="Search Results"
-            )
-
     @commands.command()
     async def search(self, ctx, *, query: str):
         """Search online for general results."""
@@ -250,51 +212,27 @@ class Search(commands.Cog, name="Basic"):
 
         await self._basic_search(ctx, query, 'maps')
 
-    @commands.command()
-    @commands.is_owner()
-    async def rejson(self, ctx):
-        """Refreshes the list of instances for searx."""
-
-        msg = await ctx.send(' Refreshing instance list...\n\n'
-                             '(Due to extensive quality checks, this may take a bit.)')
-        plausible: List[str] = []
-
-        # Get, parse, and quality check all instances
-        async with self.request.get('https://searx.space/data/instances.json') as r:
-            # Parsing
-            searx_json = await r.json()
-            instances = searx_json['instances']
-
-            # Quality Check
-            for i in instances:
-                content = instances.get(i)
-                is_good: bool = await self._instance_check(i, content)
-                if is_good:
-                    plausible.append(i)
-
-        # Save new list
-        self.instances = plausible
-        with open('searxes.txt', 'w') as f:
-            f.write('\n'.join(plausible))
-
-        await msg.edit(content='Instances refreshed!')
-
     @commands.Cog.listener()
     async def on_command_error(self, ctx, error):
         """Listener makes no command fallback to searching."""
 
-        if isinstance(error, commands.CommandNotFound) or \
-           isinstance(error, commands.CheckFailure):
+        fallback = (commands.CommandNotFound, commands.CheckFailure)
 
-            # Handling
-            async with ctx.typing():
-                # Prepares term
-                term = ctx.message.content.replace(ctx.prefix, '', 1)
-                term = term.lstrip(' ')
-
-                # Does search
-                await self._basic_search(ctx, term)
+        if isinstance(error, fallback):
+            try:
+                await self._basic_search(
+                    ctx, ctx.message.content[len(ctx.prefix):]
+                )
+            except SearchExceptions.SafesearchFail:
+                await ctx.send(
+                    "**Sorry!** That query included language "
+                    "we cannot accept in a non-NSFW channel. "
+                    "Please try again in an NSFW channel."
+                )
+            except Exception as e:
+                print(e)
 
 
 def setup(bot):
+
     bot.add_cog(Search(bot))
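For reference (not part of the patch): the cog above relies on html2text to strip markup from Qwant's titles and descriptions before they are embedded in the reply. A small standalone sketch of that conversion, using the same converter settings as __init__; the sample string is made up.

import html2text

# Same converter configuration as Search.__init__ in the patch.
tomd = html2text.HTML2Text()
tomd.ignore_links = True
tomd.ignore_images = True
tomd.ignore_tables = True
tomd.ignore_emphasis = True
tomd.body_width = 0

# Qwant titles/descriptions can contain markup such as <b> tags.
raw_title = "The <b>best</b> Discord bots of 2020"
clean_title = tomd.handle(raw_title).rstrip('\n')
print(clean_title)  # roughly: "The best Discord bots of 2020"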
diff --git a/extensions/utils/logging.py b/extensions/utils/logging.py
index de531a9..a7e3eb9 100644
--- a/extensions/utils/logging.py
+++ b/extensions/utils/logging.py
@@ -19,13 +19,13 @@ class Logging():
         self.request = bot.request
         self.online = bot.online
         self.maintenance = bot.maintenance
+        self.debug_toggle = bot.debug_toggle
 
         # Sets info hook first
         self.info_hook = self.online.get_webhook(
             bot.config['HOOKS']['INFO_HOOK']) \
             if bot.config['HOOKS']['INFO_HOOK'] \
             else None
 
-
         # Sets other hooks or defaults them
         if self.info_hook:
@@ -51,7 +51,7 @@ class Logging():
 
         # Prerequisites
         formatted_tb = traceback.format_tb(error.__traceback__)
-        formatted_tb = ''.join(formatted_tb)
+        tb_str = ''.join(formatted_tb)
         original_exc = traceback.format_exception(
             type(error), error, error.__traceback__)
 
@@ -77,7 +77,7 @@ class Logging():
         trace_content = (
             "```py\n\nTraceback (most recent call last):"
             "\n{}{}: {}```").format(
-                formatted_tb,
+                tb_str,
                 type(error).__name__,
                 error)
 
@@ -95,8 +95,8 @@
         # Provides completed embed
         return error_embed
 
-    async def info(self, content: str,
-                   embed: Optional[discord.Embed] = None,
+    async def info(self, content: str,
+                   embed: Optional[discord.Embed] = None,
                    name: Optional[str] = None):
         """Logs info and sends it to the appropriate places."""
 
@@ -112,15 +112,17 @@
         else:
             return
 
-    async def warn(self, content: str,
-                   embed: Optional[discord.Embed] = None,
-                   name: Optional[str] = None):
+    async def warn(self, content: str,
+                   embed: Optional[discord.Embed] = None,
+                   name: Optional[str] = None):
        """Logs warnings and sends them to the appropriate places."""
 
         if self.warn_hook:
             return await self.warn_hook.send(
                 content=content,
-                username=f"{self.bot.user.name} - {name if name else 'unknown'}",
+                username=(
+                    f"{self.bot.user.name} - {name if name else 'unknown'}"
+                ),
                 avatar_url=str(self.bot.user.avatar_url),
                 embed=embed
             )
@@ -129,7 +131,7 @@
     async def error(self, error: Exception, ctx: Context,
                     name: Optional[str]):
         """Logs errors and sends them to the appropriate places."""
-        
+
         # Prerequisites
         error_embed = await self._create_error_embed(error, ctx)
 
@@ -144,7 +146,9 @@
             )
             await self.error_hook.send(
                 content=fallback,
-                username=f"{self.bot.user.name} - {name if name else 'unknown'}",
+                username=(
+                    f"{self.bot.user.name} - {name if name else 'unknown'}"
+                ),
                 avatar_url=str(self.bot.user.avatar_url),
                 embed=error_embed
             )
@@ -163,15 +167,17 @@
             )
             return error_embed
 
-    async def debug(self, content: str,
-                    embed: Optional[discord.Embed] = None,
+    async def debug(self, content: str,
+                    embed: Optional[discord.Embed] = None,
                     name: Optional[str] = None):
         """Logs warnings and sends them to the appropriate places."""
 
-        if self.debug_hook and self.maintenance:
+        if self.debug_hook and (self.maintenance or self.debug_toggle):
             return await self.debug_hook.send(
-                content=content,
-                username=f"{self.bot.user.name} - {name if name else 'unknown'}",
+                content=f"```{content}```",
+                username=(
+                    f"{self.bot.user.name} - {name if name else 'unknown'}"
+                ),
                 avatar_url=str(self.bot.user.avatar_url),
                 embed=embed
             )
diff --git a/main.py b/main.py
index e6e3bbb..8477ceb 100644
--- a/main.py
+++ b/main.py
@@ -9,14 +9,13 @@ import discord
 from discord.ext import commands
 
-import traceback
 import json
 import os
 import sys
-import asyncio
 import aiohttp
 import rethinkdb
 
-from typing import List, Optional
+from typing import List
+from extensions.models import SearchExceptions
 
 
 class Bot(commands.Bot):
@@ -30,6 +29,7 @@ class Bot(commands.Bot):
 
         # Setup
         self.extensions_list: List[str] = []
+        self.debug_toggle = False
         with open('config.json') as f:
             self.config = json.load(f)
 
@@ -221,7 +221,7 @@ class Bot(commands.Bot):
 
         # Maintenance mode
         elif (
-            self.maintenance 
+            self.maintenance
             and not message.author.id == bot.appinfo.owner.id
         ):
             return
@@ -249,13 +249,26 @@ async def on_command_error(ctx, error):
     """Handles all errors stemming from ext.commands."""
 
     # Lets other cogs handle CommandNotFound.
-    # Change this if you want command not found handling
+    # Change this if you want command not found handling.
     if (
         isinstance(error, commands.CommandNotFound) or
         isinstance(error, commands.CheckFailure)
     ):
         return
 
+    # Custom message for if an argument is missing.
+    elif isinstance(error, commands.MissingRequiredArgument):
+        await ctx.send(
+            f"**Missing Argument!** A `{error.param.name}` is needed."
+        )
+
+    elif isinstance(error, SearchExceptions.SafesearchFail):
+        await ctx.send(
+            "**Sorry!** That query included language "
+            "we cannot accept in a non-NSFW channel. "
+            "Please try again in an NSFW channel."
+        )
+
     # Provides a very pretty embed if something's actually a dev's fault.
     elif isinstance(error, commands.CommandInvokeError):
@@ -284,4 +297,4 @@ async def on_command_error(ctx, error):
 # NOTE Bot Entry Point
 # Starts the bot
 print("Connecting...\n")
-bot.run(bot.config['TOKEN'])
\ No newline at end of file
+bot.run(bot.config['TOKEN'])
diff --git a/old_search.py b/old_search.py
new file mode 100644
index 0000000..4f3ee95
--- /dev/null
+++ b/old_search.py
@@ -0,0 +1,211 @@
+# This is the old search logic for reference purposes
+
+    async def _old_search_logic(self, query: str, is_nsfw: bool = False,
+                                category: str = None) -> str:
+        """Provides search logic for all search commands."""
+
+        # NSFW Filtering
+        # WARNING - This list includes slurs.
+        nono_words = [
+            'tranny', 'faggot', 'fag',
+            'porn', 'cock', 'dick',
+            'titty', 'boob', 'penis',
+            'slut', 'cum', 'jizz',
+            'semen', 'cooch', 'coochie',
+            'pussy', 'penis', 'fetish',
+            'bdsm', 'sexy', 'xxx',
+            'orgasm', 'masturbation',
+            'erotic', 'creampie',
+            'fap', 'nude', 'orgasm',
+            'squirting', 'yiff',
+            'e621'
+        ]
+        nono_sites = [
+            'xvideos', 'pornhub',
+            'xhamster', 'xnxx',
+            'youporn', 'xxx',
+            'freexcafe', 'sex.com',
+            'e621', 'nhentai'
+        ]
+
+        if not is_nsfw:
+            for i in nono_words:
+                if i in query.replace(" ", ""):
+                    return (
+                        "**Sorry!** That query included language "
+                        "we cannot accept in a non-NSFW channel. "
+                        "Please try again in an NSFW channel."
+                    )
+
+        # Choose an instance
+        if self.instances == []:
+            with open('searxes.txt') as f:
+                self.instances = f.read().split('\n')
+        instance = random.sample(self.instances, k=1)[0]
+
+        # Error Template
+        error_msg = (
+            "**An error occured!**\n\n"
+            f"There was a problem with `{instance}`. Please try again later.\n"
+            f"_If problems with this instance persist, "
+            f"contact`{self.bot.appinfo.owner}` to have it removed._"
+        )
+
+        # Create the URL to make an API call to
+        call = f'{instance}search?q={query}&format=json&language=en-US'
+
+        # If a type is provided, add that type to the call URL
+        if category:
+            call += f'&categories={category}'
+
+        if is_nsfw:
+            call += '&safesearch=0'
+        else:
+            call += '&safesearch=1'
+
+        # Figure out engines for different categories to get decent results.
+        if category == 'videos':
+            call += '&engines=bing+videos,google+videos'
+        # Make said API call
+        try:
+            async with self.request.get(call) as resp:
+                response = await resp.json()
+        except aiohttp.ClientError:
+            return error_msg
+
+        # Split our response data up for parsing
+        # infoboxes = response['infoboxes']
+        results = response['results']
+
+        # Create message with results
+        try:
+            # Handle tiny result count
+            if len(results) > 5:
+                amt = 5
+            else:
+                amt = len(results)
+
+            # Remove no-no sites
+            if not is_nsfw:
+                for r in results[0:7]:
+                    for n in nono_sites:
+                        if n in r['url']:
+                            results.remove(r)
+
+            # Escape stuff
+            query = discord.utils.escape_mentions(query)
+            query = discord.utils.escape_markdown(query)
+
+            # Header
+            msg = f"Showing **{amt}** results for `{query}`. \n\n"
+            # Expanded Result
+            msg += (
+                f"**{results[0]['title']}** <{results[0]['url']}>\n"
+                f"{results[0]['content']}\n\n")
+            # Other Results
+            msg += "\n".join(
+                [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
+            # Instance Info
+            msg += f"\n\n_Results retrieved from instance `{instance}`._"
+
+            return msg
+
+        # Reached if error with returned results
+        except (KeyError, IndexError) as e:
+            # Logging
+            await self.warn(
+                f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
+                "Consider removing it or looking into it.",
+                name="Failed Instance"
+            )
+
+            self.instances.remove(instance) # Weed the instance out
+            # Recurse until good response
+            return await self._old_search_logic(query, is_nsfw)
+
+    async def _instance_check(self, instance: str, content: dict) -> bool:
+        """Checks the quality of an instance."""
+
+        # Makes sure proper values exist
+        if 'error' in content:
+            return False
+        if not ('engines' in content and 'initial' in content['timing']):
+            return False
+        if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
+            return False
+
+        # Makes sure google is enabled
+        if not content['engines']['google']['enabled']:
+            return False
+
+        # Makes sure is not Tor
+        if content['network_type'] != 'normal':
+            return False
+
+        # Only picks instances that are fast enough
+        timing = int(content['timing']['initial'])
+        if timing > 0.20:
+            return False
+
+        # Check for Google captcha
+        test_search = f'{instance}/search?q=test&format=json&lang=en-US'
+        try:
+            async with self.request.get(test_search) as resp:
+                response = await resp.json()
+                response['results'][0]['content']
+        except (aiohttp.ClientError, KeyError, IndexError):
+            return False
+
+        # Reached if passes all checks
+        return True
+
+    @commands.command()
+    @commands.is_owner()
+    async def rejson(self, ctx):
+        """Refreshes the list of instances for searx."""
+
+        msg = await ctx.send(' Refreshing instance list...\n\n'
+                             '(Due to extensive quality checks, this may take a bit.)')
+        plausible: List[str] = []
+
+        # Get, parse, and quality check all instances
+        async with self.request.get('https://searx.space/data/instances.json') as r:
+            # Parsing
+            searx_json = await r.json()
+            instances = searx_json['instances']
+
+            # Quality Check
+            for i in instances:
+                content = instances.get(i)
+                is_good: bool = await self._instance_check(i, content)
+                if is_good:
+                    plausible.append(i)
+
+        # Save new list
+        self.instances = plausible
+        with open('searxes.txt', 'w') as f:
+            f.write('\n'.join(plausible))
+
+        await msg.edit(content='Instances refreshed!')
+
+    async def _old_basic_search(self, ctx, query: str,
+                                category: str = None):
+        """Base search message generation."""
+
+        async with ctx.typing():
+            is_nsfw = (
+                ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
+                else False
+            )
+
+            msg = await self._old_search_logic(query, is_nsfw, category)
+            await ctx.send(msg)
+
+            await self.info(
+                content=(
+                    f"**{ctx.author}** searched for `{query}` "
+                    f"in \"{ctx.guild}\" and got this:"
+                    f"\n\n{msg}"
+                ),
+                name="Search Results"
+            )
\ No newline at end of file
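For reference (not part of the patch): to test the endpoint the new _search_logic targets, the sketch below issues the same kind of Qwant request outside the bot. It assumes the api.qwant.com endpoint and the data -> result -> items response shape used above hold; it passes the query through aiohttp's params argument instead of building the URL by hand, and it skips the optional scrapestack proxying that the patch leaves commented out.

import asyncio

import aiohttp


async def qwant_search(query: str, category: str = "web",
                       count: int = 5, is_nsfw: bool = False) -> list:
    """Fetch Qwant result items the way the Search cog does (sketch)."""
    params = {
        "count": str(count),
        "q": query,
        "safesearch": "0" if is_nsfw else "2",
        "t": "web",
        "locale": "en_US",
        "uiv": "4",
    }
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)"
            " Gecko/20100101 Firefox/74.0"
        )
    }
    url = f"https://api.qwant.com/api/search/{category}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url, params=params, headers=headers) as resp:
            to_parse = await resp.json()
    # Same path the cog reads before formatting its reply.
    return to_parse["data"]["result"]["items"]


if __name__ == "__main__":
    for item in asyncio.run(qwant_search("discord bots")):
        print(f"{item['title']} <{item['url']}>")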