Moving search to Qwant

This commit is contained in:
Adriene Hutchins 2020-03-24 19:09:01 -04:00
parent 19acae2abc
commit 391beaf422
6 changed files with 422 additions and 229 deletions

View file

@ -267,6 +267,14 @@ Guild count: {len(self.bot.guilds)}
# Message Sending
await ctx.send(msg)
@commands.command()
@commands.is_owner()
async def toggle_debug(self, ctx):
    """Flips the bot's runtime debug flag (owner only)."""
    # Invert the current flag, store it back, then report the new state.
    new_state = not self.bot.debug_toggle
    self.bot.debug_toggle = new_state
    await ctx.send(f"Set debug mode to `{new_state}`.")
@commands.command(aliases=['exit', 'reboot'])
@commands.is_owner()
async def restart(self, ctx):

View file

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# search exceptions
# Provides custom exceptions for the search cog.
"""Search Exceptions File"""
from discord.ext import commands
class SafesearchFail(commands.CommandError):
    """Raised when a query contains NSFW content in a safe-search context."""
def setup(bot):
pass

View file

@ -10,6 +10,8 @@ from discord.ext import commands
import aiohttp
import random
from typing import List
from extensions.models import SearchExceptions
import html2text
class Search(commands.Cog, name="Basic"):
@ -21,193 +23,153 @@ class Search(commands.Cog, name="Basic"):
self.bot = bot
self.info = bot.logging.info
self.warn = bot.logging.warn
self.debug = bot.logging.debug
self.request = bot.request
self.emoji = "\U0001F50D"
self.scrape_token = bot.config['SCRAPESTACK']
# Get Instances
with open('searxes.txt') as f:
self.instances = f.read().split('\n')
# Markdown converter
self.tomd = html2text.HTML2Text()
self.tomd.ignore_links = True
self.tomd.ignore_images = True
self.tomd.ignore_tables = True
self.tomd.ignore_emphasis = True
self.tomd.body_width = 0
async def _search_logic(self, query: str, is_nsfw: bool = False,
category: str = None) -> str:
"""Provides search logic for all search commands."""
category: str = 'web', count: int = 5) -> list:
"""Uses scrapestack and the Qwant API to find search results."""
# Typing
base: str
safesearch: str
# NSFW Filtering
# WARNING - This list includes slurs.
# WARNING This list includes slurs.
nono_words = [
'tranny', 'faggot', 'fag',
'porn', 'cock', 'dick',
'titty', 'boob', 'penis',
'slut', 'cum', 'jizz',
'semen', 'cooch', 'coochie',
'pussy', 'penis', 'fetish',
'bdsm', 'sexy', 'xxx',
'orgasm', 'masturbation',
'erotic', 'creampie',
'fap', 'nude', 'orgasm',
'squirting', 'yiff',
'e621'
]
nono_sites = [
'xvideos', 'pornhub',
'xhamster', 'xnxx',
'youporn', 'xxx',
'freexcafe', 'sex.com',
'e621'
'tranny', 'faggot', 'fag', 'porn', 'cock', 'dick',
'titty', ' tit ', 'boob', 'penis', 'slut', ' cum ', 'jizz',
'semen', 'cooch', 'coochie', 'pussy', 'penis', 'fetish',
'bdsm', 'sexy', 'xxx', 'orgasm', 'masturbat',
'erotic', 'creampie', 'fap', 'nude', 'orgasm',
'squirting', 'yiff', 'e621', ' sex', 'ejaculat',
'cunt', 'vagina', 'coom', 'troon', 'hentai', 'yaoi',
'bukkake', 'bara', 'shota', 'loli', 'fetish', 'spunk',
'pron', 'p0rn', 'pr0n', 'gloryhole', 'felch', 'skullfuck',
'scat', 'pissplay', 'piss play', 'underage', 'bbw',
'fisting', 'queef', "rimming", 'rimjob', 'bdsm',
'cbt', 'blumpkin', 'boner', 'prostitut', 'butt plug',
'transvestite', 'femboy', 'castrat', 'philia', 'edging',
'edgeplay', 'enema', 'facial', 'fellat', 'femdom', 'footjob',
'blowjob', 'titjob', 'handjob', 'frot', 'gang bang', 'gangbang',
'glory hole', 'hermap', 'jerk off', 'jerking off', 'jack off',
'jacking off', 'kink', 'wet dream', 'anal', 'pegging', 'precum',
'pre-cum', 'pre cum', 'priap', 'scrotum', 'shemale', 'smegma',
'smut', 'softcore', 'transsexual', 'voyeur', 'viagra', 'wank',
'whore'
]
if not is_nsfw:
for i in nono_words:
if i in query.replace(" ", ""):
return (
"**Sorry!** That query included language "
"we cannot accept in a non-NSFW channel. "
"Please try again in an NSFW channel."
)
if any(n in query for n in nono_words):
raise SearchExceptions.SafesearchFail('Query had NSFW.')
# Choose an instance
if self.instances == []:
with open('searxes.txt') as f:
self.instances = f.read().split('\n')
instance = random.sample(self.instances, k=1)[0]
# Scrape or not
# if self.scrape_token != '':
# base = (
# "http://api.scrapestack.com/scrape"
# f"?access_key={self.scrape_token}"
# f"&url=https://api.qwant.com/api"
# )
# print(base)
# else:
base = "https://api.qwant.com/api"
# Error Template
error_msg = (
"**An error occured!**\n\n"
f"There was a problem with `{instance}`. Please try again later.\n"
f"_If problems with this instance persist, contact`{self.bot.appinfo.owner}` to have it removed._"
# Safesearch
if is_nsfw:
safesearch = "0"
else:
safesearch = "2"
# Search URL Building
# api.qwant.com/api/search/web?count=5&q=test&safesearch=2&...
search_url = (
f"{base}/search/{category}"
f"?count={count}"
f"&q={query}"
f"&safesearch={safesearch}"
"&t=web"
"&locale=en_US"
"&uiv=4"
)
await self.debug(search_url, name="_search_logic")
# Searching
headers = {
'User-Agent': (
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0)'
' Gecko/20100101 Firefox/74.0'
)
}
async with self.request.get(search_url, headers=headers) as resp:
to_parse = await resp.json()
# Sends results
return to_parse['data']['result']['items']
async def _basic_search(self, ctx, query: str, category: str = 'web'):
"""Basic search formatting."""
# NOTE Customizable count not yet implemented.
count: int = 5
# Safesearch variable
is_nsfw = (
ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
else False
)
# Create the URL to make an API call to
call = f'{instance}search?q={query}&format=json&language=en-US'
# If a type is provided, add that type to the call URL
if category:
call += f'&categories={category}'
if is_nsfw:
call += '&safesearch=0'
else:
call += '&safesearch=1'
# Figure out engines for different categories to get decent results.
if category == 'videos':
call += '&engines=bing+videos,google+videos'
# Make said API call
try:
async with self.request.get(call) as resp:
response = await resp.json()
except aiohttp.ClientError:
return error_msg
# Split our response data up for parsing
# infoboxes = response['infoboxes']
results = response['results']
# Create message with results
try:
# Handle tiny result count
if len(results) > 5:
amt = 5
else:
amt = len(results)
# Remove no-no sites
if not is_nsfw:
for r in results[0:7]:
for n in nono_sites:
if n in r['url']:
results.remove(r)
# Escape stuff
query = discord.utils.escape_mentions(query)
query = discord.utils.escape_markdown(query)
# Header
msg = f"Showing **{amt}** results for `{query}`. \n\n"
# Expanded Result
msg += (
f"**{results[0]['title']}** <{results[0]['url']}>\n"
f"{results[0]['content']}\n\n")
# Other Results
msg += "\n".join(
[f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
# Instance Info
msg += f"\n\n_Results retrieved from instance `{instance}`._"
return msg
# Reached if error with returned results
except (KeyError, IndexError) as e:
# Logging
await self.warn(
f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
"Consider removing it or looking into it.",
name="Failed Instance"
)
self.instances.remove(instance) # Weed the instance out
# Recurse until good response
return await self._search_logic(query, is_nsfw)
async def _instance_check(self, instance: str, content: dict) -> bool:
"""Checks the quality of an instance."""
# Makes sure proper values exist
if 'error' in content:
return False
if not ('engines' in content and 'initial' in content['timing']):
return False
if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
return False
# Makes sure google is enabled
if not content['engines']['google']['enabled']:
return False
# Makes sure is not Tor
if content['network_type'] != 'normal':
return False
# Only picks instances that are fast enough
timing = int(content['timing']['initial'])
if timing > 0.20:
return False
# Check for Google captcha
test_search = f'{instance}/search?q=test&format=json&lang=en-US'
try:
async with self.request.get(test_search) as resp:
response = await resp.json()
response['results'][0]['content']
except (aiohttp.ClientError, KeyError, IndexError):
return False
# Reached if passes all checks
return True
async def _basic_search(self, ctx, query: str,
category: str = None):
"""Base search message generation."""
# Handling
async with ctx.typing():
is_nsfw = (
ctx.channel.is_nsfw() if hasattr(ctx.channel, 'is_nsfw')
else False
# Searches
results = await self._search_logic(query, is_nsfw, category)
count = len(results)
# Escapes all nasties for displaying
query_display = discord.utils.escape_mentions(query)
query_display = discord.utils.escape_markdown(query_display)
# Return if no results
try:
results[0]
except IndexError:
return await ctx.send(
f"No results found for `{query_display}`."
)
# Gets the first entry's stuff
first_title = self.tomd.handle(results[0]['title']).rstrip('\n')
first_url = results[0]['url']
first_desc = self.tomd.handle(results[0]['desc']).rstrip('\n')
# Builds the substring for each of the other results.
other_results: List[str] = []
for r in results[1:count]:
title = self.tomd.handle(r['title']).rstrip('\n')
url = results[0]['url']
other_results.append(f"**{title}** <{url}>")
other_msg: str = "\n".join(other_results)
# Builds message
msg = (
f"Showing **{count}** results for `{query_display}`.\n\n"
f"**{first_title}** <{first_url}>\n{first_desc}\n\n"
f"{other_msg}\n\n_Powered by Qwant._"
)
msg = await self._search_logic(query, is_nsfw, category)
# Sends message
await self.debug(msg, name="_basic_search")
await ctx.send(msg)
await self.info(
content=(
f"**{ctx.author}** searched for `{query}` "
f"in \"{ctx.guild}\" and got this:"
f"\n\n{msg}"
),
name="Search Results"
)
@commands.command()
async def search(self, ctx, *, query: str):
"""Search online for general results."""
@ -250,51 +212,27 @@ class Search(commands.Cog, name="Basic"):
await self._basic_search(ctx, query, 'maps')
@commands.command()
@commands.is_owner()
async def rejson(self, ctx):
"""Refreshes the list of instances for searx."""
msg = await ctx.send('<a:updating:403035325242540032> Refreshing instance list...\n\n'
'(Due to extensive quality checks, this may take a bit.)')
plausible: List[str] = []
# Get, parse, and quality check all instances
async with self.request.get('https://searx.space/data/instances.json') as r:
# Parsing
searx_json = await r.json()
instances = searx_json['instances']
# Quality Check
for i in instances:
content = instances.get(i)
is_good: bool = await self._instance_check(i, content)
if is_good:
plausible.append(i)
# Save new list
self.instances = plausible
with open('searxes.txt', 'w') as f:
f.write('\n'.join(plausible))
await msg.edit(content='Instances refreshed!')
@commands.Cog.listener()
async def on_command_error(self, ctx, error):
"""Listener makes no command fallback to searching."""
if isinstance(error, commands.CommandNotFound) or \
isinstance(error, commands.CheckFailure):
fallback = (commands.CommandNotFound, commands.CheckFailure)
# Handling
async with ctx.typing():
# Prepares term
term = ctx.message.content.replace(ctx.prefix, '', 1)
term = term.lstrip(' ')
# Does search
await self._basic_search(ctx, term)
if isinstance(error, fallback):
try:
await self._basic_search(
ctx, ctx.message.content[len(ctx.prefix):]
)
except SearchExceptions.SafesearchFail:
await ctx.send(
"**Sorry!** That query included language "
"we cannot accept in a non-NSFW channel. "
"Please try again in an NSFW channel."
)
except Exception as e:
print(e)
def setup(bot):
    """Extension entry point: build the Search cog and register it."""
    cog = Search(bot)
    bot.add_cog(cog)

View file

@ -19,13 +19,13 @@ class Logging():
self.request = bot.request
self.online = bot.online
self.maintenance = bot.maintenance
self.debug_toggle = bot.debug_toggle
# Sets info hook first
self.info_hook = self.online.get_webhook(
bot.config['HOOKS']['INFO_HOOK']) \
if bot.config['HOOKS']['INFO_HOOK'] \
else None
# Sets other hooks or defaults them
if self.info_hook:
@ -51,7 +51,7 @@ class Logging():
# Prerequisites
formatted_tb = traceback.format_tb(error.__traceback__)
formatted_tb = ''.join(formatted_tb)
tb_str = ''.join(formatted_tb)
original_exc = traceback.format_exception(
type(error), error, error.__traceback__)
@ -77,7 +77,7 @@ class Logging():
trace_content = (
"```py\n\nTraceback (most recent call last):"
"\n{}{}: {}```").format(
formatted_tb,
tb_str,
type(error).__name__,
error)
@ -95,8 +95,8 @@ class Logging():
# Provides completed embed
return error_embed
async def info(self, content: str,
embed: Optional[discord.Embed] = None,
async def info(self, content: str,
embed: Optional[discord.Embed] = None,
name: Optional[str] = None):
"""Logs info and sends it to the appropriate places."""
@ -112,15 +112,17 @@ class Logging():
else:
return
async def warn(self, content: str,
embed: Optional[discord.Embed] = None,
name: Optional[str] = None):
async def warn(self, content: str,
embed: Optional[discord.Embed] = None,
name: Optional[str] = None):
"""Logs warnings and sends them to the appropriate places."""
if self.warn_hook:
return await self.warn_hook.send(
content=content,
username=f"{self.bot.user.name} - {name if name else 'unknown'}",
username=(
f"{self.bot.user.name} - {name if name else 'unknown'}"
),
avatar_url=str(self.bot.user.avatar_url),
embed=embed
)
@ -129,7 +131,7 @@ class Logging():
async def error(self, error: Exception, ctx: Context, name: Optional[str]):
"""Logs errors and sends them to the appropriate places."""
# Prerequisites
error_embed = await self._create_error_embed(error, ctx)
@ -144,7 +146,9 @@ class Logging():
)
await self.error_hook.send(
content=fallback,
username=f"{self.bot.user.name} - {name if name else 'unknown'}",
username=(
f"{self.bot.user.name} - {name if name else 'unknown'}"
),
avatar_url=str(self.bot.user.avatar_url),
embed=error_embed
)
@ -163,15 +167,17 @@ class Logging():
)
return error_embed
async def debug(self, content: str,
embed: Optional[discord.Embed] = None,
async def debug(self, content: str,
embed: Optional[discord.Embed] = None,
name: Optional[str] = None):
"""Logs warnings and sends them to the appropriate places."""
if self.debug_hook and self.maintenance:
if self.debug_hook and (self.maintenance or self.debug_toggle):
return await self.debug_hook.send(
content=content,
username=f"{self.bot.user.name} - {name if name else 'unknown'}",
content=f"```{content}```",
username=(
f"{self.bot.user.name} - {name if name else 'unknown'}"
),
avatar_url=str(self.bot.user.avatar_url),
embed=embed
)

25
main.py
View file

@ -9,14 +9,13 @@
import discord
from discord.ext import commands
import traceback
import json
import os
import sys
import asyncio
import aiohttp
import rethinkdb
from typing import List, Optional
from typing import List
from extensions.models import SearchExceptions
class Bot(commands.Bot):
@ -30,6 +29,7 @@ class Bot(commands.Bot):
# Setup
self.extensions_list: List[str] = []
self.debug_toggle = False
with open('config.json') as f:
self.config = json.load(f)
@ -221,7 +221,7 @@ class Bot(commands.Bot):
# Maintenance mode
elif (
self.maintenance
self.maintenance
and not message.author.id == bot.appinfo.owner.id
):
return
@ -249,13 +249,26 @@ async def on_command_error(ctx, error):
"""Handles all errors stemming from ext.commands."""
# Lets other cogs handle CommandNotFound.
# Change this if you want command not found handling
# Change this if you want command not found handling.
if (
isinstance(error, commands.CommandNotFound)
or isinstance(error, commands.CheckFailure)
):
return
# Custom message for if an argument is missing.
elif isinstance(error, commands.MissingRequiredArgument):
await ctx.send(
f"**Missing Argument!** A `{error.param.name}` is needed."
)
elif isinstance(error, SearchExceptions.SafesearchFail):
await ctx.send(
"**Sorry!** That query included language "
"we cannot accept in a non-NSFW channel. "
"Please try again in an NSFW channel."
)
# Provides a very pretty embed if something's actually a dev's fault.
elif isinstance(error, commands.CommandInvokeError):
@ -284,4 +297,4 @@ async def on_command_error(ctx, error):
# NOTE Bot Entry Point
# Starts the bot
print("Connecting...\n")
bot.run(bot.config['TOKEN'])
bot.run(bot.config['TOKEN'])

211
old_search.py Normal file
View file

@ -0,0 +1,211 @@
# This is the old search logic for reference purposes
async def _old_search_logic(self, query: str, is_nsfw: bool = False,
                            category: str = None) -> str:
    """Provides search logic for all search commands.

    Picks a random searx instance, queries it, and returns a formatted
    result string (or an error-message string on failure).

    query: raw user search term.
    is_nsfw: whether the invoking channel allows NSFW content.
    category: optional searx category appended to the call URL.
    """
    # NSFW Filtering
    # WARNING - This list includes slurs.
    nono_words = [
        'tranny', 'faggot', 'fag',
        'porn', 'cock', 'dick',
        'titty', 'boob', 'penis',
        'slut', 'cum', 'jizz',
        'semen', 'cooch', 'coochie',
        'pussy', 'penis', 'fetish',
        'bdsm', 'sexy', 'xxx',
        'orgasm', 'masturbation',
        'erotic', 'creampie',
        'fap', 'nude', 'orgasm',
        'squirting', 'yiff',
        'e621'
    ]
    nono_sites = [
        'xvideos', 'pornhub',
        'xhamster', 'xnxx',
        'youporn', 'xxx',
        'freexcafe', 'sex.com',
        'e621', 'nhentai'
    ]
    # Substring match against the query with spaces stripped, so
    # "p o r n"-style spacing does not evade the filter.
    if not is_nsfw:
        for i in nono_words:
            if i in query.replace(" ", ""):
                return (
                    "**Sorry!** That query included language "
                    "we cannot accept in a non-NSFW channel. "
                    "Please try again in an NSFW channel."
                )
    # Choose an instance
    # Lazily reload the pool from disk when it has been emptied
    # (failed instances are removed below).
    if self.instances == []:
        with open('searxes.txt') as f:
            self.instances = f.read().split('\n')
    instance = random.sample(self.instances, k=1)[0]
    # Error Template
    error_msg = (
        "**An error occured!**\n\n"
        f"There was a problem with `{instance}`. Please try again later.\n"
        f"_If problems with this instance persist, "
        f"contact`{self.bot.appinfo.owner}` to have it removed._"
    )
    # Create the URL to make an API call to
    call = f'{instance}search?q={query}&format=json&language=en-US'
    # If a type is provided, add that type to the call URL
    if category:
        call += f'&categories={category}'
    if is_nsfw:
        call += '&safesearch=0'
    else:
        call += '&safesearch=1'
    # Figure out engines for different categories to get decent results.
    if category == 'videos':
        call += '&engines=bing+videos,google+videos'
    # Make said API call
    try:
        async with self.request.get(call) as resp:
            response = await resp.json()
    except aiohttp.ClientError:
        return error_msg
    # Split our response data up for parsing
    # infoboxes = response['infoboxes']
    results = response['results']
    # Create message with results
    try:
        # Handle tiny result count
        if len(results) > 5:
            amt = 5
        else:
            amt = len(results)
        # Remove no-no sites
        # Iterates a slice copy of the first 7 results so removal from
        # the underlying list does not disturb iteration.
        if not is_nsfw:
            for r in results[0:7]:
                for n in nono_sites:
                    if n in r['url']:
                        results.remove(r)
        # Escape stuff
        query = discord.utils.escape_mentions(query)
        query = discord.utils.escape_markdown(query)
        # Header
        msg = f"Showing **{amt}** results for `{query}`. \n\n"
        # Expanded Result
        msg += (
            f"**{results[0]['title']}** <{results[0]['url']}>\n"
            f"{results[0]['content']}\n\n")
        # Other Results
        msg += "\n".join(
            [f"**{entry['title']}** <{entry['url']}>" for entry in results[1:5]])
        # Instance Info
        msg += f"\n\n_Results retrieved from instance `{instance}`._"
        return msg
    # Reached if error with returned results
    except (KeyError, IndexError) as e:
        # Logging
        await self.warn(
            f"A user encountered a(n) `{e}` with <{instance}> when searching for `{query}`. "
            "Consider removing it or looking into it.",
            name="Failed Instance"
        )
        self.instances.remove(instance)  # Weed the instance out
        # Recurse until good response
        # NOTE(review): recursion is unbounded — if every instance fails,
        # the emptied pool is reloaded from disk above and this can loop
        # indefinitely; confirm whether a retry cap is needed.
        return await self._old_search_logic(query, is_nsfw)
async def _instance_check(self, instance: str, content: dict) -> bool:
"""Checks the quality of an instance."""
# Makes sure proper values exist
if 'error' in content:
return False
if not ('engines' in content and 'initial' in content['timing']):
return False
if not ('google' in content['engines'] and 'enabled' in content['engines']['google']):
return False
# Makes sure google is enabled
if not content['engines']['google']['enabled']:
return False
# Makes sure is not Tor
if content['network_type'] != 'normal':
return False
# Only picks instances that are fast enough
timing = int(content['timing']['initial'])
if timing > 0.20:
return False
# Check for Google captcha
test_search = f'{instance}/search?q=test&format=json&lang=en-US'
try:
async with self.request.get(test_search) as resp:
response = await resp.json()
response['results'][0]['content']
except (aiohttp.ClientError, KeyError, IndexError):
return False
# Reached if passes all checks
return True
@commands.command()
@commands.is_owner()
async def rejson(self, ctx):
    """Refreshes the list of instances for searx."""
    status = await ctx.send('<a:updating:403035325242540032> Refreshing instance list...\n\n'
                            '(Due to extensive quality checks, this may take a bit.)')
    # Download the public instance directory and keep only entries
    # that pass the quality check.
    passing: List[str] = []
    async with self.request.get('https://searx.space/data/instances.json') as r:
        directory = await r.json()
        candidates = directory['instances']
        for url in candidates:
            if await self._instance_check(url, candidates.get(url)):
                passing.append(url)
    # Persist the vetted list both in memory and on disk.
    self.instances = passing
    with open('searxes.txt', 'w') as f:
        f.write('\n'.join(passing))
    await status.edit(content='Instances refreshed!')
async def _old_basic_search(self, ctx, query: str,
                            category: str = None):
    """Base search message generation."""
    async with ctx.typing():
        # Some channel types lack is_nsfw(); default those to SFW.
        if hasattr(ctx.channel, 'is_nsfw'):
            is_nsfw = ctx.channel.is_nsfw()
        else:
            is_nsfw = False
        # Run the search and relay the formatted result to the channel.
        result_msg = await self._old_search_logic(query, is_nsfw, category)
        await ctx.send(result_msg)
        # Mirror the search and its outcome to the info log hook.
        log_text = (
            f"**{ctx.author}** searched for `{query}` "
            f"in \"{ctx.guild}\" and got this:"
            f"\n\n{result_msg}"
        )
        await self.info(content=log_text, name="Search Results")