Moving search to Qwant

Adriene Hutchins 2020-03-24 19:09:01 -04:00
parent 19acae2abc
commit 391beaf422
6 changed files with 422 additions and 229 deletions

View file

@@ -267,6 +267,14 @@ Guild count: {len(self.bot.guilds)}
            # Message Sending
            await ctx.send(msg)

    @commands.command()
    @commands.is_owner()
    async def toggle_debug(self, ctx):
        """Toggles debug while running."""

        self.bot.debug_toggle = not self.bot.debug_toggle
        await ctx.send(f"Set debug mode to `{self.bot.debug_toggle}`.")

    @commands.command(aliases=['exit', 'reboot'])
    @commands.is_owner()
    async def restart(self, ctx):

View file

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# search exceptions
# Provides custom exceptions for the search cog.

"""Search Exceptions File"""

from discord.ext import commands


class SafesearchFail(commands.CommandError):
    """Thrown when a query contains NSFW content."""
    pass


def setup(bot):
    pass
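
Because SafesearchFail subclasses commands.CommandError, discord.py dispatches it to on_command_error handlers as-is instead of wrapping it in CommandInvokeError, which is what the search cog and main.py rely on later in this commit. The following is an illustrative sketch of that flow and is not part of the commit; the cog and command names are hypothetical.

# Illustrative sketch only; ExampleCog and the lookup command are hypothetical.
from discord.ext import commands
from extensions.models import SearchExceptions


class ExampleCog(commands.Cog):
    @commands.command()
    async def lookup(self, ctx, *, query: str):
        # Stand-in for the real word filter in the search cog.
        if 'badword' in query:
            raise SearchExceptions.SafesearchFail('Query had NSFW.')
        await ctx.send(f"Searching for `{query}`...")

    @commands.Cog.listener()
    async def on_command_error(self, ctx, error):
        # CommandError subclasses arrive here unwrapped.
        if isinstance(error, SearchExceptions.SafesearchFail):
            await ctx.send("**Sorry!** That query isn't allowed here.")


def setup(bot):
    bot.add_cog(ExampleCog(bot))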

View file

@@ -10,6 +10,8 @@ from discord.ext import commands
import aiohttp
import random
from typing import List
from extensions.models import SearchExceptions
import html2text


class Search(commands.Cog, name="Basic"):
@@ -21,193 +23,153 @@ class Search(commands.Cog, name="Basic"):
        self.bot = bot
        self.info = bot.logging.info
        self.warn = bot.logging.warn
        self.debug = bot.logging.debug
        self.request = bot.request
        self.emoji = "\U0001F50D"
        self.scrape_token = bot.config['SCRAPESTACK']

        # Markdown converter
        self.tomd = html2text.HTML2Text()
        self.tomd.ignore_links = True
        self.tomd.ignore_images = True
        self.tomd.ignore_tables = True
        self.tomd.ignore_emphasis = True
        self.tomd.body_width = 0

    async def _search_logic(self, query: str, is_nsfw: bool = False,
                            category: str = 'web', count: int = 5) -> list:
        """Uses scrapestack and the Qwant API to find search results."""

        # Typing
        base: str
        safesearch: str

        # NSFW Filtering
        # WARNING This list includes slurs.
        nono_words = [
            'tranny', 'faggot', 'fag', 'porn', 'cock', 'dick',
            'titty', ' tit ', 'boob', 'penis', 'slut', ' cum ', 'jizz',
            'semen', 'cooch', 'coochie', 'pussy', 'penis', 'fetish',
            'bdsm', 'sexy', 'xxx', 'orgasm', 'masturbat',
            'erotic', 'creampie', 'fap', 'nude', 'orgasm',
            'squirting', 'yiff', 'e621', ' sex', 'ejaculat',
            'cunt', 'vagina', 'coom', 'troon', 'hentai', 'yaoi',
            'bukkake', 'bara', 'shota', 'loli', 'fetish', 'spunk',
            'pron', 'p0rn', 'pr0n', 'gloryhole', 'felch', 'skullfuck',
            'scat', 'pissplay', 'piss play', 'underage', 'bbw',
            'fisting', 'queef', "rimming", 'rimjob', 'bdsm',
            'cbt', 'blumpkin', 'boner', 'prostitut', 'butt plug',
            'transvestite', 'femboy', 'castrat', 'philia', 'edging',
            'edgeplay', 'enema', 'facial', 'fellat', 'femdom', 'footjob',
            'blowjob', 'titjob', 'handjob', 'frot', 'gang bang', 'gangbang',
            'glory hole', 'hermap', 'jerk off', 'jerking off', 'jack off',
            'jacking off', 'kink', 'wet dream', 'anal', 'pegging', 'precum',
            'pre-cum', 'pre cum', 'priap', 'scrotum', 'shemale', 'smegma',
            'smut', 'softcore', 'transsexual', 'voyeur', 'viagra', 'wank',
            'whore'
        ]
        if any(n in query for n in nono_words):
            raise SearchExceptions.SafesearchFail('Query had NSFW.')

        # Scrape or not
        # if self.scrape_token != '':
        #     base = (
        #         "http://api.scrapestack.com/scrape"
        #         f"?access_key={self.scrape_token}"
        #         f"&url=https://api.qwant.com/api"
        #     )
        #     print(base)
        # else:
        base = "https://api.qwant.com/api"

        # Safesearch
        if is_nsfw:
            safesearch = "0"
        else:
            safesearch = "2"

        # Search URL Building
        # api.qwant.com/api/search/web?count=5&q=test&safesearch=2&...
        search_url = (
            f"{base}/search/{category}"
            f"?count={count}"
            f"&q={query}"
            f"&safesearch={safesearch}"
            "&t=web"
            "&locale=en_US"
            "&uiv=4"
        )
        await self.debug(search_url, name="_search_logic")

        # Searching
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)'
                ' Gecko/20100101 Firefox/74.0'
            )
        }
        async with self.request.get(search_url, headers=headers) as resp:
            to_parse = await resp.json()

            # Sends results
            return to_parse['data']['result']['items']

    async def _basic_search(self, ctx, query: str, category: str = 'web'):
        """Basic search formatting."""

        # NOTE Customizable count not yet implemented.
        count: int = 5

        # Safesearch variable
        is_nsfw = (
            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
            else False
        )

        # Handling
        async with ctx.typing():

            # Searches
            results = await self._search_logic(query, is_nsfw, category)
            count = len(results)

            # Escapes all nasties for displaying
            query_display = discord.utils.escape_mentions(query)
            query_display = discord.utils.escape_markdown(query_display)

            # Return if no results
            try:
                results[0]
            except IndexError:
                return await ctx.send(
                    f"No results found for `{query_display}`."
                )

            # Gets the first entry's stuff
            first_title = self.tomd.handle(results[0]['title']).rstrip('\n')
            first_url = results[0]['url']
            first_desc = self.tomd.handle(results[0]['desc']).rstrip('\n')

            # Builds the substring for each of the other results.
            other_results: List[str] = []
            for r in results[1:count]:
                title = self.tomd.handle(r['title']).rstrip('\n')
                url = r['url']
                other_results.append(f"**{title}** <{url}>")
            other_msg: str = "\n".join(other_results)

            # Builds message
            msg = (
                f"Showing **{count}** results for `{query_display}`.\n\n"
                f"**{first_title}** <{first_url}>\n{first_desc}\n\n"
                f"{other_msg}\n\n_Powered by Qwant._"
            )

            # Sends message
            await self.debug(msg, name="_basic_search")
            await ctx.send(msg)

    @commands.command()
    async def search(self, ctx, *, query: str):
        """Search online for general results."""
@@ -250,51 +212,27 @@ class Search(commands.Cog, name="Basic"):
        await self._basic_search(ctx, query, 'maps')

    @commands.Cog.listener()
    async def on_command_error(self, ctx, error):
        """Listener makes no command fallback to searching."""

        fallback = (commands.CommandNotFound, commands.CheckFailure)

        if isinstance(error, fallback):
            try:
                await self._basic_search(
                    ctx, ctx.message.content[len(ctx.prefix):]
                )
            except SearchExceptions.SafesearchFail:
                await ctx.send(
                    "**Sorry!** That query included language "
                    "we cannot accept in a non-NSFW channel. "
                    "Please try again in an NSFW channel."
                )
            except Exception as e:
                print(e)


def setup(bot):
    bot.add_cog(Search(bot))
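
For context on the data _search_logic returns: it queries the Qwant web search API and hands back data.result.items, and _basic_search reads the title, url, and desc fields of each item. Below is a standalone sketch of the same call, assuming the endpoint still accepts the parameters used in the hunk above; it is illustrative only and not part of the commit.

# Illustrative, standalone sketch of the Qwant call the cog makes; the
# endpoint's behavior is assumed from this diff and may change on Qwant's side.
import asyncio
import aiohttp


async def qwant_search(query: str, count: int = 5, safesearch: str = "2") -> list:
    """Returns items shaped like to_parse['data']['result']['items']."""
    params = {
        'count': str(count), 'q': query, 'safesearch': safesearch,
        't': 'web', 'locale': 'en_US', 'uiv': '4',
    }
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)'
            ' Gecko/20100101 Firefox/74.0'
        )
    }
    async with aiohttp.ClientSession() as session:
        async with session.get('https://api.qwant.com/api/search/web',
                               params=params, headers=headers) as resp:
            data = await resp.json()
    # Each item carries at least 'title', 'url', and 'desc'; the cog runs
    # 'title' and 'desc' through html2text before displaying them.
    return data['data']['result']['items']


if __name__ == '__main__':
    for item in asyncio.run(qwant_search('discord.py'))[:3]:
        print(item['url'])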

View file

@@ -19,6 +19,7 @@ class Logging():
        self.request = bot.request
        self.online = bot.online
        self.maintenance = bot.maintenance
        self.debug_toggle = bot.debug_toggle

        # Sets info hook first
        self.info_hook = self.online.get_webhook(
@@ -26,7 +27,6 @@ class Logging():
            if bot.config['HOOKS']['INFO_HOOK'] \
            else None

        # Sets other hooks or defaults them
        if self.info_hook:
            self.warn_hook = self.online.get_webhook(
@@ -51,7 +51,7 @@ class Logging():
        # Prerequisites
        formatted_tb = traceback.format_tb(error.__traceback__)
        tb_str = ''.join(formatted_tb)
        original_exc = traceback.format_exception(
            type(error), error, error.__traceback__)
@@ -77,7 +77,7 @@ class Logging():
        trace_content = (
            "```py\n\nTraceback (most recent call last):"
            "\n{}{}: {}```").format(
                tb_str,
                type(error).__name__,
                error)
@@ -120,7 +120,9 @@ class Logging():
        if self.warn_hook:
            return await self.warn_hook.send(
                content=content,
                username=(
                    f"{self.bot.user.name} - {name if name else 'unknown'}"
                ),
                avatar_url=str(self.bot.user.avatar_url),
                embed=embed
            )
@@ -144,7 +146,9 @@ class Logging():
            )
            await self.error_hook.send(
                content=fallback,
                username=(
                    f"{self.bot.user.name} - {name if name else 'unknown'}"
                ),
                avatar_url=str(self.bot.user.avatar_url),
                embed=error_embed
            )
@@ -168,10 +172,12 @@ class Logging():
                    name: Optional[str] = None):
        """Logs warnings and sends them to the appropriate places."""

        if self.debug_hook and (self.maintenance or self.debug_toggle):
            return await self.debug_hook.send(
                content=f"```{content}```",
                username=(
                    f"{self.bot.user.name} - {name if name else 'unknown'}"
                ),
                avatar_url=str(self.bot.user.avatar_url),
                embed=embed
            )
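
Taken together with the toggle_debug command added earlier in this commit, these logging changes mean bot.logging.debug() forwards its content to the debug webhook, wrapped in a Discord code block, whenever a debug hook is configured and either maintenance mode or the new debug_toggle is on. The cog below is a hypothetical usage sketch, not part of the commit; only bot.logging.debug, bot.debug_toggle, and the name= keyword come from this diff.

# Hypothetical cog; illustrates the call pattern only.
from discord.ext import commands


class ExampleCog(commands.Cog):
    def __init__(self, bot):
        self.bot = bot
        self.debug = bot.logging.debug  # same pattern the Search cog uses

    @commands.command()
    async def ping(self, ctx):
        # Forwarded to the debug webhook (inside a ``` code block) only when
        # a debug hook exists and maintenance or debug_toggle is on;
        # otherwise the call is a no-op.
        await self.debug(f"ping from {ctx.author}", name="ping")
        await ctx.send("Pong!")


def setup(bot):
    bot.add_cog(ExampleCog(bot))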

21
main.py
View file

@@ -9,14 +9,13 @@
import discord
from discord.ext import commands

import json
import os
import sys

import aiohttp
import rethinkdb
from typing import List

from extensions.models import SearchExceptions


class Bot(commands.Bot):
@@ -30,6 +29,7 @@ class Bot(commands.Bot):
        # Setup
        self.extensions_list: List[str] = []
        self.debug_toggle = False

        with open('config.json') as f:
            self.config = json.load(f)
@@ -249,13 +249,26 @@ async def on_command_error(ctx, error):
    """Handles all errors stemming from ext.commands."""

    # Lets other cogs handle CommandNotFound.
    # Change this if you want command not found handling.
    if (
        isinstance(error, commands.CommandNotFound)
        or isinstance(error, commands.CheckFailure)
    ):
        return

    # Custom message for if an argument is missing.
    elif isinstance(error, commands.MissingRequiredArgument):
        await ctx.send(
            f"**Missing Argument!** A `{error.param.name}` is needed."
        )

    elif isinstance(error, SearchExceptions.SafesearchFail):
        await ctx.send(
            "**Sorry!** That query included language "
            "we cannot accept in a non-NSFW channel. "
            "Please try again in an NSFW channel."
        )

    # Provides a very pretty embed if something's actually a dev's fault.
    elif isinstance(error, commands.CommandInvokeError):

211
old_search.py Normal file
View file

@@ -0,0 +1,211 @@
# This is the old search logic for reference purposes

async def _old_search_logic(self, query: str, is_nsfw: bool = False,
                            category: str = None) -> str:
    """Provides search logic for all search commands."""

    # NSFW Filtering
    # WARNING - This list includes slurs.
    nono_words = [
        'tranny', 'faggot', 'fag',
        'porn', 'cock', 'dick',
        'titty', 'boob', 'penis',
        'slut', 'cum', 'jizz',
        'semen', 'cooch', 'coochie',
        'pussy', 'penis', 'fetish',
        'bdsm', 'sexy', 'xxx',
        'orgasm', 'masturbation',
        'erotic', 'creampie',
        'fap', 'nude', 'orgasm',
        'squirting', 'yiff',
        'e621'
    ]
    nono_sites = [
        'xvideos', 'pornhub',
        'xhamster', 'xnxx',
        'youporn', 'xxx',
        'freexcafe', 'sex.com',
        'e621', 'nhentai'
    ]

    if not is_nsfw:
        for i in nono_words:
            if i in query.replace(" ", ""):
                return (
                    "**Sorry!** That query included language "
                    "we cannot accept in a non-NSFW channel. "
                    "Please try again in an NSFW channel."
                )

    # Choose an instance
    if self.instances == []:
        with open('searxes.txt') as f:
            self.instances = f.read().split('\n')
    instance = random.sample(self.instances, k=1)[0]

    # Error Template
    error_msg = (
        "**An error occured!**\n\n"
        f"There was a problem with `{instance}`. Please try again later.\n"
        f"_If problems with this instance persist, "
        f"contact`{self.bot.appinfo.owner}` to have it removed._"
    )

    # Create the URL to make an API call to
    call = f'{instance}search?q={query}&format=json&language=en-US'

    # If a type is provided, add that type to the call URL
    if category:
        call += f'&categories={category}'

    if is_nsfw:
        call += '&safesearch=0'
    else:
        call += '&safesearch=1'

    # Figure out engines for different categories to get decent results.
    if category == 'videos':
        call += '&engines=bing+videos,google+videos'

    # Make said API call
    try:
        async with self.request.get(call) as resp:
            response = await resp.json()
    except aiohttp.ClientError:
        return error_msg

    # Split our response data up for parsing
    # infoboxes = response['infoboxes']
    results = response['results']

    # Create message with results
    try:
        # Handle tiny result count
        if len(results) > 5:
            amt = 5
        else:
            amt = len(results)

        # Remove no-no sites
        if not is_nsfw:
            for r in results[0:7]:
                for n in nono_sites:
                    if n in r['url']:
                        results.remove(r)

        # Escape stuff
        query = discord.utils.escape_mentions(query)
        query = discord.utils.escape_markdown(query)

        # Header
        msg = f"Showing **{amt}** results for `{query}`. \n\n"

        # Expanded Result
        msg += (
            f"**{results[0]['title']}** <{results[0]['url']}>\n"
            f"{results[0]['content']}\n\n")

        # Other Results
        msg += "\n".join(
            [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])

        # Instance Info
        msg += f"\n\n_Results retrieved from instance `{instance}`._"

        return msg

    # Reached if error with returned results
    except (KeyError, IndexError) as e:
        # Logging
        await self.warn(
            f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
            "Consider removing it or looking into it.",
            name="Failed Instance"
        )
        self.instances.remove(instance)  # Weed the instance out

        # Recurse until good response
        return await self._old_search_logic(query, is_nsfw)


async def _instance_check(self, instance: str, content: dict) -> bool:
    """Checks the quality of an instance."""

    # Makes sure proper values exist
    if 'error' in content:
        return False
    if not ('engines' in content and 'initial' in content['timing']):
        return False
    if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
        return False

    # Makes sure google is enabled
    if not content['engines']['google']['enabled']:
        return False

    # Makes sure is not Tor
    if content['network_type'] != 'normal':
        return False

    # Only picks instances that are fast enough
    timing = int(content['timing']['initial'])
    if timing > 0.20:
        return False

    # Check for Google captcha
    test_search = f'{instance}/search?q=test&format=json&lang=en-US'
    try:
        async with self.request.get(test_search) as resp:
            response = await resp.json()
            response['results'][0]['content']
    except (aiohttp.ClientError, KeyError, IndexError):
        return False

    # Reached if passes all checks
    return True


@commands.command()
@commands.is_owner()
async def rejson(self, ctx):
    """Refreshes the list of instances for searx."""

    msg = await ctx.send('<a:updating:403035325242540032> Refreshing instance list...\n\n'
                         '(Due to extensive quality checks, this may take a bit.)')
    plausible: List[str] = []

    # Get, parse, and quality check all instances
    async with self.request.get('https://searx.space/data/instances.json') as r:
        # Parsing
        searx_json = await r.json()
        instances = searx_json['instances']

        # Quality Check
        for i in instances:
            content = instances.get(i)
            is_good: bool = await self._instance_check(i, content)
            if is_good:
                plausible.append(i)

    # Save new list
    self.instances = plausible
    with open('searxes.txt', 'w') as f:
        f.write('\n'.join(plausible))

    await msg.edit(content='Instances refreshed!')


async def _old_basic_search(self, ctx, query: str,
                            category: str = None):
    """Base search message generation."""

    async with ctx.typing():
        is_nsfw = (
            ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
            else False
        )

        msg = await self._old_search_logic(query, is_nsfw, category)
        await ctx.send(msg)

        await self.info(
            content=(
                f"**{ctx.author}** searched for `{query}` "
                f"in \"{ctx.guild}\" and got this:"
                f"\n\n{msg}"
            ),
            name="Search Results"
        )