![見出し画像](https://assets.st-note.com/production/uploads/images/159990342/rectangle_large_type_2_59bdd4495df214ad73949cc72d7bc2a1.png?width=1200)
The Road to 100,000 Followers, Part 3
I rewrote my Selenium script with Requests + BeautifulSoup, and it ran about 1.7x faster. I'll be using the latter from now on.
![](https://assets.st-note.com/img/1730334463-rLnjK9F6yqPzbGTWd8ZXUxeu.png)
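If you want to compare run times of the two versions the same way, here is a minimal sketch of that kind of measurement (the script file names are hypothetical):

```python
import subprocess
import time

# Hypothetical file names for the two versions of the script.
for script in ["selenium_version.py", "requests_version.py"]:
    start = time.perf_counter()
    subprocess.run(["python", script], check=True)  # run one full crawl
    print(f"{script}: {time.perf_counter() - start:.1f} s")
```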
Rough flow
1. Fetch the top 30 reviews for Saitama Prefecture, ranked by "helpful" (参考になった) count
2. Visit the public page of each user who wrote one of those 30 reviews
3. Collect the public-page URLs of every user who marked that reviewer as helpful
As it turned out, the top 30 reviews were written by just 9 users. Summing the "helpful" users across those 9 reviewers gives 15,736; after removing duplicates, 4,038 unique users remained.
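Those two numbers fall out of a simple set-based deduplication over the collected URL list, along these lines (the file name is the Step 4 output of the Requests version below):

```python
# Count total vs. unique referrer URLs collected in Step 4.
with open("saitama_referrer_user_home_links_1-1.txt") as f:
    urls = [line.strip() for line in f]
print(len(urls))       # 15736 "helpful" users in this run
print(len(set(urls)))  # 4038 after removing duplicates
```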
Selenium version
```python
#####
# Create a text file listing "sankou" (helpful-vote) users
# 2024.10.14
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pprint
import datetime

kuchikomi_top_url = "https://www.cityheaven.net/saitama/reviewlist/like/1/"  # Saitama pref. review top page

START_PAGE_OF_ITEM_LINKS = 1
END_PAGE_OF_ITEM_LINKS = 1
suffix_no = f"{START_PAGE_OF_ITEM_LINKS}-{END_PAGE_OF_ITEM_LINKS}"

REVIEW_ITEM_LINKS = f"review_item_links_{suffix_no}.txt"
REVIEWER_USER_HOME_LINKS = f"reviewer_user_home_links_{suffix_no}.txt"
UNDUP_REVIEWER_USER_HOME_LINKS = f"undup_reviewer_user_home_links_{suffix_no}.txt"
REFERRER_USER_HOME_LINKS = f"referrer_user_home_links_{suffix_no}.txt"
UNDUP_REFERRER_USER_HOME_LINKS = f"undup_referrer_user_home_links_{suffix_no}.txt"


def main():
    options = Options()
    options.add_argument('--blink-settings=imagesEnabled=false')  # do not download images
    options.add_argument('--ignore-certificate-errors')  # avoid SSL errors
    options.add_argument('--ignore-ssl-errors')
    options.add_argument('--log-level=1')  # suppress TensorFlow messages
    options.page_load_strategy = 'normal'
    global driver
    driver = webdriver.Chrome(options=options)

    # Step 1. collect review item links
    for i in range(START_PAGE_OF_ITEM_LINKS, END_PAGE_OF_ITEM_LINKS + 1):
        url = f"https://www.cityheaven.net/saitama/reviewlist/like/{i}/"
        now = datetime.datetime.now()
        print(f"{now} Step 1, getting item links from page {i}")
        write_review_item_links(url)

    # Step 2. collect reviewer user home links
    with open(REVIEW_ITEM_LINKS, mode="r") as f:
        for line in f:
            now = datetime.datetime.now()
            print(f"{now} Step 2, getting reviewer home from {line.strip()}")
            write_reviewer_user_home(line)

    # Step 3. eliminate duplicated reviewer home links
    now = datetime.datetime.now()
    print(f"{now} Start Step 3: eliminate duplicated lines")
    eliminate_duplicated_lines(REVIEWER_USER_HOME_LINKS, UNDUP_REVIEWER_USER_HOME_LINKS)

    # Step 4. get referrer home links for each reviewer
    with open(UNDUP_REVIEWER_USER_HOME_LINKS, mode="r") as f:
        for line in f:
            referrer_list_top_url = line.replace('review-list', 'refer-user-list')
            now = datetime.datetime.now()
            print(f"{now} Step 4, getting referrer homes from {referrer_list_top_url}")
            write_referrer_user_home(referrer_list_top_url)
    now = datetime.datetime.now()
    print(f"{now} Step 4 finished")

    # Step 5. eliminate duplicated referrer links
    eliminate_duplicated_lines(REFERRER_USER_HOME_LINKS, UNDUP_REFERRER_USER_HOME_LINKS)
    now = datetime.datetime.now()
    print(f"{now} Step 5 finished")


def write_review_item_links(url):
    driver.get(url)
    try:
        elements = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "item-link"))
        )
        with open(REVIEW_ITEM_LINKS, mode='a') as o:
            for element in elements:
                item_link = element.get_attribute('href')
                print(item_link, file=o)
    except TimeoutException:  # WebDriverWait raises TimeoutException, not the builtin TimeoutError
        print("Timeout while waiting for review items on " + url)


def write_reviewer_user_home(url):
    driver.get(url)
    try:
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                "div.review-item-head > div.review-item-head-top > div > p > a"))
        )
        user_home = elements[0].get_attribute('href')
        print(f"detected user home url: {user_home}")
        with open(REVIEWER_USER_HOME_LINKS, mode='a') as o:
            print(user_home, file=o)
    except TimeoutException:
        print("Timeout while waiting for review items on " + url)
    except NoSuchElementException:
        print("can't find element on " + url)


def eliminate_duplicated_lines(input_filename, output_filename):
    with open(input_filename, mode='r') as f:
        dup_lines = [line.strip() for line in f]
    print("####### duplicated ###########")
    pprint.pprint(dup_lines)
    uniq_lines = set(dup_lines)
    print("####### unduplicated ###########")
    pprint.pprint(uniq_lines)
    with open(output_filename, mode='a') as f:
        for line in uniq_lines:
            print(line, file=f)


def write_referrer_user_home(url):
    print(f"retrieving {url}")
    driver.get(url)
    while True:
        try:
            print(f"retrieving {driver.current_url}")
            elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "item-link"))
            )
            with open(REFERRER_USER_HOME_LINKS, mode='a') as f:
                for element in elements:
                    referrer_url = element.get_attribute('href')
                    print(referrer_url)
                    print(referrer_url, file=f)
            if detect_next_button() == "last":
                break
            else:
                next_button_element = driver.find_element(By.CSS_SELECTOR, "div.pager > a.next")
                driver.execute_script("arguments[0].click();", next_button_element)  # click even if the element is out of view
        except TimeoutException:
            print("Timeout while waiting for review items on " + url)
            break
        except NoSuchElementException:
            print("can't find element on " + url)
            break


def detect_next_button():
    try:
        next_button_element = driver.find_element(By.CSS_SELECTOR, "div.pager > a.next")
        if next_button_element.text == "次へ":  # "次へ" means "Next"
            print("detected next button")
            return "next"
        else:
            print("no next button found")
            return "last"
    except NoSuchElementException:
        print("No next button found, no such element")
        return "last"


if __name__ == "__main__":
    main()
```
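A side note on the Selenium version: it drives a visible Chrome window. If you want to run it unattended, Chrome's headless mode is one more option you could add (not used in my runs):

```python
options.add_argument('--headless=new')  # run Chrome without a visible window
```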
Requests + BeautifulSoup version
```python
####
# Create the list of users to follow
# 2024.10.28
import requests
from bs4 import BeautifulSoup
import datetime
from urllib.parse import urljoin

BASE_URL = "https://www.cityheaven.net"
PREFECTURE = "saitama"
BASE_KUCHIKOMI_TOP_URL = f"{BASE_URL}/{PREFECTURE}/reviewlist/like"
START_PAGE_OF_ITEM_LINKS = 1
END_PAGE_OF_ITEM_LINKS = 1
suffix_no = f"{START_PAGE_OF_ITEM_LINKS}-{END_PAGE_OF_ITEM_LINKS}"

REVIEW_ITEM_LINKS = f"{PREFECTURE}_review_item_links_{suffix_no}.txt"
REVIEWER_USER_HOME_LINKS = f"{PREFECTURE}_reviewer_user_home_links_{suffix_no}.txt"
UNDUP_REVIEWER_USER_HOME_LINKS = f"{PREFECTURE}_undup_reviewer_user_home_links_{suffix_no}.txt"
REFERRER_USER_HOME_LINKS = f"{PREFECTURE}_referrer_user_home_links_{suffix_no}.txt"
UNDUP_REFERRER_USER_HOME_LINKS = f"{PREFECTURE}_undup_referrer_user_home_links_{suffix_no}.txt"

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}


def main():
    # Step 1: collect review item links
    for i in range(START_PAGE_OF_ITEM_LINKS, END_PAGE_OF_ITEM_LINKS + 1):
        url = f"{BASE_KUCHIKOMI_TOP_URL}/{i}/"
        now = datetime.datetime.now()
        print(f"{now} Step 1, getting item links from page {i}")
        write_review_item_links(url)

    # Step 2: collect reviewer user home links
    with open(REVIEW_ITEM_LINKS, mode="r") as f:
        for line in f:
            now = datetime.datetime.now()
            print(f"{now} Step 2, getting reviewer home from {line.strip()}")
            write_reviewer_user_home(line.strip())

    # Step 3: eliminate duplicated reviewer home links
    now = datetime.datetime.now()
    print(f"{now} Start Step 3: eliminate duplicated lines")
    eliminate_duplicated_lines(REVIEWER_USER_HOME_LINKS, UNDUP_REVIEWER_USER_HOME_LINKS)

    # Step 4: get referrer home links for each reviewer
    with open(UNDUP_REVIEWER_USER_HOME_LINKS, mode="r") as f:
        for line in f:
            referrer_list_top_url = line.strip().replace('review-list', 'refer-user-list')
            now = datetime.datetime.now()
            print(f"{now} Step 4, getting referrer homes from {referrer_list_top_url}")
            write_referrer_user_home(referrer_list_top_url)
    now = datetime.datetime.now()
    print(f"{now} Step 4 finished")

    # Step 5: eliminate duplicated referrer links
    eliminate_duplicated_lines(REFERRER_USER_HOME_LINKS, UNDUP_REFERRER_USER_HOME_LINKS)
    now = datetime.datetime.now()
    print(f"{now} Step 5 finished")


def write_review_item_links(url):
    response = requests.get(url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    elements = soup.find_all(class_="item-link")
    with open(REVIEW_ITEM_LINKS, mode='a') as f:
        for element in elements:
            item_link = element.get('href')
            full_item_link = urljoin(BASE_URL, item_link)
            f.write(full_item_link + "\n")


def write_reviewer_user_home(url):
    response = requests.get(url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    elements = soup.select("div.review-item-head > div.review-item-head-top > div > p > a")
    if elements:
        user_home = elements[0].get('href')
        full_user_home = urljoin(BASE_URL, user_home)
        with open(REVIEWER_USER_HOME_LINKS, mode='a') as f:
            f.write(full_user_home + "\n")


def eliminate_duplicated_lines(input_filename, output_filename):
    with open(input_filename, 'r') as f:
        lines = f.readlines()
    uniq_lines = set(line.strip() for line in lines)
    with open(output_filename, 'w') as f:
        for line in uniq_lines:
            f.write(line + "\n")


def write_referrer_user_home(url):
    while url:
        response = requests.get(url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.find_all(class_="item-link")
        with open(REFERRER_USER_HOME_LINKS, mode='a') as f:
            for element in elements:
                referrer_url = element.get('href')
                full_referrer_url = urljoin(BASE_URL, referrer_url)
                f.write(full_referrer_url + "\n")
        # Follow the pager's "next" link until there is none.
        next_button = soup.select_one("div.pager > a.next")
        # if next_button and next_button.text == "次へ":  # alternative: match the "Next" label text
        if next_button and next_button.get('href'):
            now = datetime.datetime.now()
            url = urljoin(BASE_URL, next_button.get('href'))
            print(f"{now} detected next button, now going to {url}")
        else:
            now = datetime.datetime.now()
            print(f"{now} no next button detected on {url}")
            break


if __name__ == "__main__":
    main()
```
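If I extend the Requests version further, two easy wins would be reusing a single requests.Session, which keeps the TCP connection alive across the many page fetches, and sleeping briefly between requests to stay polite to the server. A minimal sketch, not part of the script above:

```python
import time
import requests

session = requests.Session()
session.headers.update(HEADERS)  # same HEADERS dict as in the script above

def fetch(url, delay=1.0):
    """GET a page over a persistent connection, then pause briefly."""
    response = session.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors
    time.sleep(delay)  # be polite: space out requests
    return response.text
```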