Coverage for fanfics\custom_models.py : 46%

Character, CharacterFanfic, Chapter)
def fetch_page(url, auth_url, cookie_name):
    """ Get the html of the url """
        'password': FANFIC_PASSWORD}
            timeout=(connect_timeout, read_timeout)).text
    except requests.exceptions.RequestException as e:
        logger.error("Error fetching the page of the fanfic {}".format(e))
        page_html = None
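
# A minimal, hedged sketch of what the full fetch_page above probably looks
# like, inferred from the fragments (login payload, timeout tuple, logger).
# FANFIC_USERNAME / FANFIC_PASSWORD handling, the payload keys and the cookie
# check are assumptions, not the author's confirmed implementation.
import logging
import requests

logger = logging.getLogger(__name__)
connect_timeout, read_timeout = 5.0, 30.0


def fetch_page_sketch(url, auth_url=None, cookie_name=None,
                      username=None, password=None):
    """ Get the html of the url, logging in first when auth_url is given """
    page_html = None
    try:
        session = requests.Session()
        if auth_url:
            # hypothetical payload keys; the fragment only shows 'password'
            session.post(auth_url,
                         data={'username': username, 'password': password},
                         timeout=(connect_timeout, read_timeout))
            if cookie_name and cookie_name not in session.cookies:
                logger.warning("Login cookie %s was not set", cookie_name)
        page_html = session.get(
            url, timeout=(connect_timeout, read_timeout)).text
    except requests.exceptions.RequestException as e:
        logger.error("Error fetching the page of the fanfic {}".format(e))
        page_html = None
    return page_html
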
""" Set the page html """
""" Get the page html """
""" Parse html to get title and author """
""" Get the fandom and media type """ return "Avengers", "movies"
""" Get language """ return
""" Get genres """ return []
""" Get status """ return False
""" Get last time updated """ return
""" Get the chapters """ page_html = self.get_page_html().find('div', id="chapters") chapters = [] if page_html: chapters_html = page_html.find_all('ol') if chapters_html: chapters_html = chapters_html[0].find_all('li')
cont = 1 for li in chapters_html: chapter_obj = {} chapter_obj["title"] = li.find_all('h3')[0].text chapter_obj["num"] = cont chapter_obj["url"] = li.find_all( 'h3')[0].find_all('a')[0]['href'] chapters.append(chapter_obj) cont += 1
return chapters
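
# Hedged illustration of the chapter markup the parser above expects
# (an <ol> of <li><h3><a href=...>title</a></h3></li> inside div#chapters);
# the sample HTML is invented for the example.
from bs4 import BeautifulSoup

sample = """
<div id="chapters">
  <ol>
    <li><h3><a href="/Story/1/1/">Chapter 1</a></h3></li>
    <li><h3><a href="/Story/1/2/">Chapter 2</a></h3></li>
  </ol>
</div>
"""

soup = BeautifulSoup(sample, 'html.parser')
ol = soup.find('div', id="chapters").find_all('ol')[0]
chapters = []
for cont, li in enumerate(ol.find_all('li'), start=1):
    chapters.append({
        "title": li.find_all('h3')[0].text,
        "num": cont,
        "url": li.find_all('h3')[0].find_all('a')[0]['href'],
    })
# chapters -> [{'title': 'Chapter 1', 'num': 1, 'url': '/Story/1/1/'}, ...]
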
""" Get number of words """ return
""" Get rating """ return
""" Get characters """ return []
""" Get data to send log in data """
""" Check if the url is a fanfic or a chapter """ page_html = self.get_page_html().find('div', id="chapter") if page_html is None: # does not exists, so it's a fanfic return True return False
""" Get the fandom and media type """
fandom = 'other'
media_type = "anime" media_type == "comics" or media_type == "games" or media_type == "movies" or media_type == "tv"): pass media_type = "plays" else:
""" Get genres """
info = info.replace("sci-fi", "fantasy") return []
""" Get status """
return True else:
""" Get last time updated """ page_html = self.get_page_html().find('div', id="story") info = page_html.find_all('p', 'meta')[0].text # first occurrence date = None regex = re.search('(Updated:(?P<updated>.*?) -).*', info) if regex: # the story has been updated date = regex.group('updated').rstrip().lstrip() date = date.strip() else: # the story has not been updated regex = re.search('(Published:(?P<published>.*?) -).*', info) if regex: date = regex.group('published').rstrip().lstrip() date = date.strip()
if date is None: return
if "hour" in date or "minute" in date or "second" in date: date_time_obj = datetime.datetime.now() else: date_time_obj = datetime.datetime.strptime(date, '%Y-%m-%d') return date_time_obj
""" Get the chapters """ chapters_html = self.get_page_html().find('div', id="chapters") if chapters_html: chapters_html = chapters_html.find_all('ul', 'storylist')[0] chapters_html = chapters_html.find_all('li') else: # just one chapter chapters_html = self.get_page_html().find_all('div', 'storylist')
chapters = [] cont = 1 for li in chapters_html: chapter_obj = {} chapter_obj["title"] = li.find_all('h4')[0].text.lstrip().rstrip() chapter_obj["num"] = cont chapter_obj["url"] = "https://ficwad.com" + li.find_all( 'h4')[0].find_all( 'a')[0]['href'].rstrip().lstrip() chapters.append(chapter_obj) cont += 1
return chapters
""" Get number of words """ page_html = self.get_page_html().find('div', id="story") info = page_html.find_all('p', 'meta')[0].text # first occurrence words = None regex = re.search('.*- (?P<words>\d+).*words.*', info) if regex: # the story has been updated words = regex.group('words') words = words.strip()
return words
""" Get rating """
rating = "M" return rating
rating = "K" rating = "K+" elif rating == "r" or rating == "nc-17": rating = "M"
""" Get characters """ page_html = self.get_page_html().find('div', id="story") info = page_html.find_all('p', 'meta')[0].text # first occurrence
characters = [] regex = re.search('(Characters:(?P<characters>.*?)-).*', info) if regex: characters = regex.group('characters').rstrip().lstrip() characters = characters.split(",") return characters
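
# Hedged demo of the Characters regex above on an invented meta line; note
# that the non-greedy match stops at the first hyphen, so hyphenated names
# would be cut short.
import re

info = "Rated: PG - English - Characters: Tony S., Steve R. - Complete"

characters = []
regex = re.search('(Characters:(?P<characters>.*?)-).*', info)
if regex:
    characters = regex.group('characters').strip().split(",")
# characters -> ['Tony S.', ' Steve R.']
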
""" Get site data to send log in data """
""" Check if the url is a fanfic or a chapter """ page_html = self.get_page_html().find_all('form', class_="chapterlist") if len(page_html) > 0: # exists, so it's a chapter return False return True
""" Get the fandom and media type """ 'ul')[0].find_all('li')
media_type = "tv" media_type = "movies"
media_type = "other" fandom = info[0].select('a')[0].text.lower().rstrip().lstrip() fandom = re.sub(r"\s*-\s*.*", "", fandom) return fandom, media_type
""" Get the language """ language = self.get_page_html().select('dd.language')[0].text.strip() return language
""" Get genres """ genres_list = [] genres = self.get_page_html().select('dd.freeform.tags') if genres: genres = genres[0].find_all('ul', 'commas')[0].find_all('li')
for li in genres: genre = li.find('a').text.strip().lower() if fanfic_model.is_genre_option(genre) is True: genres_list.append(genre) return genres_list
""" Get status """ status = self.get_page_html().select('dl.stats')[0].find_all( 'dd', 'chapters')[0].text.strip() if "?" in status: return False else: return True
""" Get last time updated """ info = self.get_page_html().select('dl.stats')[0]
if "Updated" in info: # the story has been updated date = info.select('dd.status')[0].text.strip() else: # the story has not been updated date = info.select('dd.published')[0].text.strip()
if date is None: return
date_time_obj = datetime.datetime.strptime(date, '%Y-%m-%d') return date_time_obj
""" Get the chapters """ chapters = [] chapters_html = self.get_page_html().find('select', id="selected_id") if chapters_html is None: # there is one chapter only´ chapter_obj = {} title, author = self.get_title_and_author() chapter_obj["title"] = title.lstrip().rstrip() chapter_obj["num"] = 1 chapter_obj["url"] = None chapters.append(chapter_obj) return chapters
chapters_html = chapters_html.find_all('option') url = self.get_page_html().find('ul', id='chapter_index') url = ("https://archiveofourown.org" + re.sub(r"\/\d+$", "", url.select('form')[0]['action']) + "/")
for li in chapters_html: chapter_obj = {} chapter_text = li.text
regex = re.search('^(?P<num>\d+)\.(?P<name>.*)$', chapter_text)
chapter_obj["title"] = regex.group('name').lstrip().rstrip() chapter_obj["num"] = regex.group('num').strip() chapter_obj["url"] = url + li['value'] chapters.append(chapter_obj)
return chapters
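
# Hedged demo of the chapter <option> label parsing above, on an invented
# "N. Title" string like the ones AO3's chapter selector renders.
import re

chapter_text = "3. The Long Night"
regex = re.search(r'^(?P<num>\d+)\.(?P<name>.*)$', chapter_text)
if regex:
    num = regex.group('num').strip()      # '3'
    title = regex.group('name').strip()   # 'The Long Night'
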
""" Get number of words """ info = self.get_page_html().select('dl.stats')[0] number_of_words = info.find_all('dd', 'words') if number_of_words: number_of_words = number_of_words[0].text.strip() else: number_of_words = None return number_of_words
""" Get rating """ rating_text = self.get_page_html().select( 'dd.rating.tags')[0].text.rstrip().lstrip().lower()
rating = None if "general" in rating_text: rating = "K" elif "teen" in rating_text: rating = "T" elif "mature" in rating_text: rating = "M"
return rating
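
# Hedged restatement of the AO3 rating mapping above as a standalone helper;
# the "explicit" -> "M" branch and the None fallback are assumptions, only
# the general/teen/mature branches are visible in the original.
def map_ao3_rating(rating_text):
    """ Map an AO3 rating tag to a fanfiction.net-style letter """
    rating_text = rating_text.strip().lower()
    if "general" in rating_text:
        return "K"
    if "teen" in rating_text:
        return "T"
    if "mature" in rating_text or "explicit" in rating_text:
        return "M"
    return None
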
""" Get characters """ return []
""" Get url to send log in data """ "_otwarchive_session")
""" Check if the url is a fanfic or a chapter """ # it doesn't matter here return True
""" Transform the url to remove the protocol and last slash """ url = re.compile(r"https?://(www\.)?") url = url.sub('', self.url).strip().strip('/') site = self.get_site() if "archiveofourown" in site: regex = re.compile(r"(\/chapters.*)?") url = regex.sub('', url).strip().strip('/') return url
""" Get the html of the url """
""" Get title and author """
""" Get language """
""" Get characters """ characters = self.scraper.get_characters() return characters
""" Get genres """
""" Get status """
""" Get last time updated """ last_time_updated = self.scraper.get_last_time_updated() return last_time_updated
""" Get the chapters """ chapters = self.scraper.get_chapters() return chapters
""" Get number of words """ num_words = self.scraper.get_num_words() return num_words
""" Get rating """
""" Scrape the fanfic's fields and save them """ with transaction.atomic(): new_fanfic = fanfic_model() fandom_fanfic_relation = FandomFanfic()
self.title, self.author = self.get_title_and_author() new_fanfic.name = self.title new_fanfic.author = self.author
new_fanfic.web = self.url
new_fanfic.save()
self.fandom, self.media_type = self.get_fandom_and_media_type() type_obj = Type.objects.get(name=self.media_type)
if self.media_type == "other": fandom_in_system = Fandom.objects.filter( name__icontains=self.fandom) else: fandom_in_system = Fandom.objects.filter( name__icontains=self.fandom, type=type_obj)
if fandom_in_system.exists(): fandom = fandom_in_system.first() else: new_fandom = Fandom() new_fandom.name = self.fandom new_fandom.type = type_obj new_fandom.save() fandom = new_fandom
fandom_fanfic_relation.fanfic = new_fanfic fandom_fanfic_relation.fandom = fandom fandom_fanfic_relation.is_primary = True fandom_fanfic_relation.save()
self.language = self.get_language() new_fanfic.language = self.language
self.genres = self.get_genres() for genre in self.genres: new_fanfic.add_genre(genre)
self.status = self.get_status() new_fanfic.complete = self.status
self.characters = self.get_characters() for character in self.characters: character_obj = Character.objects.filter( name_surname__icontains=character, fandom=fandom_fanfic_relation.fandom)
new_character_fanfic_relation = CharacterFanfic() if character_obj.exists(): character_obj = character_obj.first()
if not CharacterFanfic.objects.filter( character=character_obj, fanfic=new_fanfic).exists(): new_character_fanfic_relation.character = \ character_obj else: new_character = Character() new_character.name_surname = character new_character.fandom = fandom_fanfic_relation. \ fandom new_character.save() character_obj = new_character
new_character_fanfic_relation.character = character_obj new_character_fanfic_relation.fanfic = new_fanfic new_character_fanfic_relation.save()
self.last_time_updated = self.get_last_time_updated() new_fanfic.last_time_updated = self.last_time_updated
self.chapters = self.get_chapters() # array for chapter in self.chapters: new_chapter = Chapter() new_chapter.title = chapter['title'] new_chapter.num_chapter = chapter["num"] if chapter["url"] is None: chapter["url"] = self.url new_chapter.url_chapter = chapter["url"] new_chapter.fanfic = new_fanfic new_chapter.save()
self.num_words = self.get_num_words() new_fanfic.num_words = self.num_words
self.rating = self.get_rating() new_fanfic.rating = self.rating
# save it again new_fanfic.save() return new_fanfic
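
# Hedged usage sketch of the scrape-and-save flow above; the wrapper class
# name FanficScraper and the method name save() are assumptions based on the
# fragments (check_full_url, check_if_is_fanfic_or_chapter, the atomic save).
def import_fanfic(url):
    scraper = FanficScraper(url)                 # hypothetical class name
    checked_url = scraper.check_full_url()
    if checked_url.startswith("Error"):
        # check_full_url returns an error message string on bad urls
        return checked_url
    if not scraper.check_if_is_fanfic_or_chapter():
        return "Error: the url points to a chapter, not a fanfic"
    # runs the scraping inside transaction.atomic() and returns the new fanfic
    return scraper.save()
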
""" Check if the url is a fanfic or a chapter """ return self.scraper.check_if_is_fanfic_or_chapter()
""" Get the site name of url """ "archiveofourown.org"]
""" Check if url format is ok """ elif (site == "avengersfanfiction.com" and "Story" in str(self.url)): return True elif (site == "archiveofourown.org" and "works" in str( self.url)): return True
""" check if 200 """ timeout=(connect_timeout, read_timeout)) except Exception: return False
""" Check if a url is completely correct """ clean_url_fanfic = clean_string(self.url)
if "view_full_work" in clean_url_fanfic: # ao3 self.url = clean_url_fanfic.replace( "?view_full_work=true", "")
if clean_url_fanfic is None: # bad written url return "Error: Sorry, are you sure the url is correct?"
format_ok = self.check_url_format() if format_ok is False: # wrong site or error in url return "Error: Sorry, are you sure the url is valid?"
return self.url
""" Set the appropiate scraper for the url """ |