From f7380ae15dbfc8f1d3a71d42c330e7827870f7a3 Mon Sep 17 00:00:00 2001
From: Ben Busby <33362396+benbusby@users.noreply.github.com>
Date: Thu, 11 Jun 2020 13:21:40 -0600
Subject: [PATCH] Improving ad filtering for non-English languages

---
Reviewer notes (this section is ignored by `git am`):
    * has_ad_content() no longer matches only 'ad' / 'sponsoredⓘ'. It now
      compares the span text case-insensitively (via str.upper()) against
      the new BLACKLIST in app/utils/misc.py, and additionally treats any
      text containing 'ⓘ' as ad content.
    * The caller in class Filter passes the raw span text; the previous
      .lower() call is dropped because the comparison is now
      case-insensitive inside has_ad_content().
    * Line breaks and indentation in this file were restored from a copy
      that had been collapsed onto a single line; hunk line counts and the
      diffstat below were used to validate the reconstruction.

 app/filter.py     | 7 ++++---
 app/utils/misc.py | 5 +++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/app/filter.py b/app/filter.py
index 9944441..1cc9f87 100644
--- a/app/filter.py
+++ b/app/filter.py
@@ -1,4 +1,5 @@
 from app.request import VALID_PARAMS
+from app.utils.misc import BLACKLIST
 from bs4 import BeautifulSoup
 from bs4.element import ResultSet
 from cryptography.fernet import Fernet
@@ -47,8 +48,8 @@ def filter_link_args(query_link):
     return query_link
 
 
-def has_ad_content(element):
-    return element == 'ad' or element == 'sponsoredⓘ'
+def has_ad_content(element: str):
+    return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element
 
 
 class Filter:
@@ -133,7 +134,7 @@ class Filter:
             return
 
         for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
-            has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text.lower())])
+            has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)])
             _ = div.decompose() if has_ad else None
 
     def fix_question_section(self):
diff --git a/app/utils/misc.py b/app/utils/misc.py
index f959abe..b87941d 100644
--- a/app/utils/misc.py
+++ b/app/utils/misc.py
@@ -2,6 +2,11 @@ from cryptography.fernet import Fernet
 from flask import current_app as app
 
 REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
+BLACKLIST = [
+    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
+    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
+    'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
+]
 
 
 def generate_user_keys(cookies_disabled=False) -> dict: