from io import BytesIO from lxml import etree import pycurl import random import urllib.parse as urlparse # Core Google search URLs SEARCH_URL = 'https://www.google.com/search?gbv=1&q=' AUTOCOMPLETE_URL = 'https://suggestqueries.google.com/complete/search?client=toolbar&' MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0' DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' # Valid query params VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] def gen_user_agent(normal_ua, is_mobile): mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' if is_mobile: return MOBILE_UA.format(mozilla, firefox) else: return DESKTOP_UA.format(mozilla, linux, firefox) def gen_query(query, args, config, near_city=None): param_dict = {key: '' for key in VALID_PARAMS} # Use :past(hour/day/week/month/year) if available # example search "new restaurants :past month" if ':past' in query: time_range = str.strip(query.split(':past', 1)[-1]) param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0]) # Ensure search query is parsable query = urlparse.quote(query) # Pass along type of results (news, images, books, etc) if 'tbm' in args: param_dict['tbm'] = '&tbm=' + args.get('tbm') # Get results page start value (10 per page, ie page 2 start val = 20) if 'start' in args: param_dict['start'] = '&start=' + args.get('start') # Search for results near a particular city, if available if near_city: param_dict['near'] = '&near=' + urlparse.quote(near_city) # Set language for results (lr) and interface (hl) param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '') param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') for val in param_dict.values(): if not val or val is None: continue query += val return query class Request: def __init__(self, normal_ua, language='lang_en'): self.language = language self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua self.modified_user_agent = gen_user_agent(normal_ua, self.mobile) def __getitem__(self, name): return getattr(self, name) def get_decode_value(self): if 'lang_zh' in self.language: return 'gb2312' else: return 'unicode-escape' def autocomplete(self, query): ac_query = dict(hl=self.language, q=query) response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)) if response: dom = etree.fromstring(response) return dom.xpath('//suggestion/@data') return [] def send(self, base_url=SEARCH_URL, query='', return_bytes=False): response_header = [] b_obj = BytesIO() crl = pycurl.Curl() crl.setopt(crl.URL, base_url + query) crl.setopt(crl.USERAGENT, self.modified_user_agent) crl.setopt(crl.WRITEDATA, b_obj) crl.setopt(crl.HEADERFUNCTION, response_header.append) crl.setopt(pycurl.FOLLOWLOCATION, 1) crl.perform() crl.close() if return_bytes: return b_obj.getvalue() else: return b_obj.getvalue().decode(self.get_decode_value(), 'ignore')