
Road to 100,000 Followers, Part 5
Results
Before the run I was following 20,000 accounts.

After the run:

Follows increased by 37,000.
The number is still growing, but it seems that within about a week of a review being posted, roughly 20% of the accounts I follow press the "this was helpful" button.


Step 1: Extract users to follow
Brute-force parallelization. The commands were generated in Excel.
python get_referrer_w_requests.py hokkaido 1 120 &
python get_referrer_w_requests.py aomori 1 120 &
python get_referrer_w_requests.py miyagi 1 120 &
python get_referrer_w_requests.py iwate 1 120 &
python get_referrer_w_requests.py akita 1 120 &
python get_referrer_w_requests.py yamagata 1 120 &
python get_referrer_w_requests.py fukushima 1 120 &
python get_referrer_w_requests.py ibaraki 1 120 &
python get_referrer_w_requests.py tochigi 1 120 &
python get_referrer_w_requests.py gunma 1 120 &
python get_referrer_w_requests.py saitama 1 120 &
python get_referrer_w_requests.py chiba 1 120 &
python get_referrer_w_requests.py tokyo 1 120 &
python get_referrer_w_requests.py kanagawa 1 120 &
python get_referrer_w_requests.py nigata 1 120 &
python get_referrer_w_requests.py toyama 1 120 &
python get_referrer_w_requests.py gifu 1 120 &
python get_referrer_w_requests.py ishikawa 1 120 &
python get_referrer_w_requests.py fukui 1 120 &
python get_referrer_w_requests.py yamanashi 1 120 &
python get_referrer_w_requests.py nagano 1 120 &
python get_referrer_w_requests.py shizuoka 1 120 &
python get_referrer_w_requests.py aichi 1 120 &
python get_referrer_w_requests.py mie 1 120 &
python get_referrer_w_requests.py shiga 1 120 &
python get_referrer_w_requests.py kyoto 1 120 &
python get_referrer_w_requests.py osaka 1 120 &
python get_referrer_w_requests.py hyogo 1 120 &
python get_referrer_w_requests.py nara 1 120 &
python get_referrer_w_requests.py wakayama 1 120 &
python get_referrer_w_requests.py tottori 1 120 &
python get_referrer_w_requests.py shimane 1 120 &
python get_referrer_w_requests.py okayama 1 120 &
python get_referrer_w_requests.py hiroshima 1 120 &
python get_referrer_w_requests.py yamaguchi 1 120 &
python get_referrer_w_requests.py tokushima 1 120 &
python get_referrer_w_requests.py kagawa 1 120 &
python get_referrer_w_requests.py ehime 1 120 &
python get_referrer_w_requests.py kochi 1 120 &
python get_referrer_w_requests.py fukuoka 1 120 &
python get_referrer_w_requests.py saga 1 120 &
python get_referrer_w_requests.py nagasaki 1 120 &
python get_referrer_w_requests.py kumamoto 1 120 &
python get_referrer_w_requests.py oita 1 120 &
python get_referrer_w_requests.py miyazaki 1 120 &
python get_referrer_w_requests.py kagoshima 1 120 &
python get_referrer_w_requests.py okinawa 1 120 &
The romanized prefecture names were borrowed from here:
https://gist.github.com/koseki/38926
Running each command in the background with an ampersand from inside a for loop would also have achieved the parallelism, and that would have been the smarter approach.
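As an illustration of that idea, here is a minimal Python sketch using subprocess (hypothetical; the actual run used the Excel-generated shell commands above):

import subprocess

# Launch one job per prefecture in the background, then wait for all of them,
# instead of pasting 47 hand-built command lines.
PREFECTURES = ["hokkaido", "aomori", "iwate"]  # ...and the rest of the 47

procs = [
    subprocess.Popen(["python", "get_referrer_w_requests.py", pref, "1", "120"])
    for pref in PREFECTURES
]
for p in procs:
    p.wait()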
It looks like some errors occurred along the way, but when I got back from a few hours out, everything had finished running.
[1] Done python get_referrer_w_requests.py hokkaido 1 120
[2] Done python get_referrer_w_requests.py aomori 1 120
[3] Done python get_referrer_w_requests.py iwate 1 120
[4] Done python get_referrer_w_requests.py miyagi 1 120
[5] Done python get_referrer_w_requests.py akita 1 120
[6] Done python get_referrer_w_requests.py yamagata 1 120
[7] Done python get_referrer_w_requests.py fukushima 1 120
[8] Done python get_referrer_w_requests.py ibaraki 1 120
[9] Done python get_referrer_w_requests.py tochigi 1 120
[10] Done python get_referrer_w_requests.py gunma 1 120
[11] Done python get_referrer_w_requests.py saitama 1 120
[12] Done python get_referrer_w_requests.py chiba 1 120
[13] Done python get_referrer_w_requests.py tokyo 1 120
[14] Done python get_referrer_w_requests.py kanagawa 1 120
[15] Exit 1 python get_referrer_w_requests.py nigata 1 120
[16] Done python get_referrer_w_requests.py toyama 1 120
[17] Done python get_referrer_w_requests.py gifu 1 120
[18] Done python get_referrer_w_requests.py ishikawa 1 120
[19] Done python get_referrer_w_requests.py fukui 1 120
[20] Done python get_referrer_w_requests.py yamanashi 1 120
[21] Done python get_referrer_w_requests.py nagano 1 120
[22] Done python get_referrer_w_requests.py shizuoka 1 120
[23] Done python get_referrer_w_requests.py aichi 1 120
[24] Done python get_referrer_w_requests.py mie 1 120
[25] Exit 1 python get_referrer_w_requests.py shiga 1 120
[26] Done python get_referrer_w_requests.py kyoto 1 120
[27] Done python get_referrer_w_requests.py osaka 1 120
[28] Done python get_referrer_w_requests.py hyogo 1 120
[29] Done python get_referrer_w_requests.py nara 1 120
[30] Done python get_referrer_w_requests.py wakayama 1 120
[31] Exit 1 python get_referrer_w_requests.py tottori 1 120
[32] Done python get_referrer_w_requests.py shimane 1 120
[33] Done python get_referrer_w_requests.py okayama 1 120
[34] Done python get_referrer_w_requests.py hiroshima 1 120
[35] Done python get_referrer_w_requests.py yamaguchi 1 120
[36] Done python get_referrer_w_requests.py tokushima 1 120
[37] Done python get_referrer_w_requests.py kagawa 1 120
[38] Done python get_referrer_w_requests.py ehime 1 120
[39] Done python get_referrer_w_requests.py kochi 1 120
[40] Done python get_referrer_w_requests.py fukuoka 1 120
[41] Done python get_referrer_w_requests.py saga 1 120
[42] Done python get_referrer_w_requests.py nagasaki 1 120
[43] Done python get_referrer_w_requests.py kumamoto 1 120
[44] Done python get_referrer_w_requests.py oita 1 120
[45] Done python get_referrer_w_requests.py miyazaki 1 120
[46]- Done python get_referrer_w_requests.py kagoshima 1 120
[47]+ Done python get_referrer_w_requests.py okinawa 1 120
Step 2: Deduplication
Each prefecture's file was already deduplicated on its own; simply concatenating them all gave about 240,000 lines. Deduplicating the combined list again left roughly 69,000.
$ ls | grep undup_referrer_user_home_links | xargs -I {} cat {} | wc -l
240486
$ ls | grep undup_referrer_user_home_links | xargs -I {} cat {} | sort | uniq | wc -l
69933
$ ls | grep undup_referrer_user_home_links | xargs -I {} cat {} | sort | uniq > followlist1105.txt
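The same merge-and-dedup could be done in Python as well; a minimal sketch, assuming the per-prefecture files sit in the current directory:

import glob

# Gather every per-prefecture URL file, dedupe across all of them,
# and write the combined follow list (mirrors the shell pipeline above).
urls = set()
for path in glob.glob("*undup_referrer_user_home_links*"):
    with open(path) as f:
        urls.update(line.strip() for line in f)

with open("followlist1105.txt", "w") as f:
    for url in sorted(urls):
        f.write(url + "\n")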
Step 3: Follow
Split the list into 5,000-line chunks and process them in parallel (a Python sketch of the chunking follows the commands below).
cat followlist1105.txt | head -n 5000 | tail -n 5000 > follow1.txt
cat followlist1105.txt | head -n 10000 | tail -n 5000 > follow2.txt
cat followlist1105.txt | head -n 15000 | tail -n 5000 > follow3.txt
cat followlist1105.txt | head -n 20000 | tail -n 5000 > follow4.txt
cat followlist1105.txt | head -n 25000 | tail -n 5000 > follow5.txt
cat followlist1105.txt | head -n 30000 | tail -n 5000 > follow6.txt
cat followlist1105.txt | head -n 35000 | tail -n 5000 > follow7.txt
cat followlist1105.txt | head -n 40000 | tail -n 5000 > follow8.txt
cat followlist1105.txt | head -n 45000 | tail -n 5000 > follow9.txt
cat followlist1105.txt | head -n 50000 | tail -n 5000 > follow10.txt
cat followlist1105.txt | head -n 55000 | tail -n 5000 > follow11.txt
cat followlist1105.txt | head -n 60000 | tail -n 5000 > follow12.txt
cat followlist1105.txt | head -n 65000 | tail -n 5000 > follow13.txt
cat followlist1105.txt | head -n 70000 | tail -n 5000 > follow14.txt
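For reference, the same chunking can be written as a short Python loop (a sketch; the file names follow the follow{N}.txt pattern above):

from itertools import islice

# Read followlist1105.txt once and emit 5,000-line chunks
# named follow1.txt, follow2.txt, ... until the input runs out.
with open("followlist1105.txt") as f:
    n = 1
    while True:
        chunk = list(islice(f, 5000))
        if not chunk:
            break
        with open(f"follow{n}.txt", "w") as out:
            out.writelines(chunk)
        n += 1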
$ python follow_users_w_selenium.py follow1.txt &
python follow_users_w_selenium.py follow2.txt &
python follow_users_w_selenium.py follow3.txt &
python follow_users_w_selenium.py follow4.txt &
python follow_users_w_selenium.py follow5.txt &
python follow_users_w_selenium.py follow6.txt &
python follow_users_w_selenium.py follow7.txt &
python follow_users_w_selenium.py follow8.txt &
python follow_users_w_selenium.py follow9.txt &
python follow_users_w_selenium.py follow10.txt &
python follow_users_w_selenium.py follow11.txt &
python follow_users_w_selenium.py follow12.txt &
python follow_users_w_selenium.py follow13.txt &
python follow_users_w_selenium.py follow14.txt &
[1] 3109
[2] 3114
[3] 3119
[4] 3124
[5] 3129
[6] 3134
[7] 3139
[8] 3144
[9] 3149
[10] 3154
[11] 3159
[12] 3164
[13] 3169
[14] 3174
Here is the screen during the run: 14 Chrome instances working in parallel.

The windows flip from page to page as the follows are executed one after another.

Three errors:
[1] Exit 1 python follow_users_w_selenium.py follow1.txt
[4] Exit 1 python follow_users_w_selenium.py follow4.txt
[10] Exit 1 python follow_users_w_selenium.py follow10.txt
The tracebacks showed a chromedriver permission error. Cause unknown.
selenium.common.exceptions.WebDriverException: Message: 'chromedriver.exe' executable may have wrong permissions.
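One speculative mitigation, assuming the failure is a transient race between 14 simultaneous driver launches, would be to retry the driver creation with a short backoff (hypothetical; not part of the original script):

import time
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

def create_driver(options, retries=3):
    # Try a few times before giving up, sleeping between attempts.
    for attempt in range(retries):
        try:
            return webdriver.Chrome(options=options)
        except WebDriverException:
            if attempt == retries - 1:
                raise
            time.sleep(5)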
I kicked it off around midnight before bed, and it had finished by 4 a.m. That works out to about 15,000 follows per hour, i.e., roughly 4 per second, which matches what the automated browsers looked like they were doing.
https://www.cityheaven.net/userpage/50176408/review-list/ has been added to your follow members!
https://www.cityheaven.net/userpage/50177775/review-list/ has been added to your follow members!
https://www.cityheaven.net/userpage/50178149/review-list/ has been added to your follow members!
finish at 2024-11-05 04:31:24.359978
I attempted 69,000 follows. If every one of them were new, the total would come to 89,000 follows (the existing 20,000 plus 69,000). Conversely, even if all 20,000 accounts I already followed were contained in the 69,000 and thus already followed, 49,000 would still be new targets, so the total should reach at least 69,000. The actual figure was 57,000, which strongly suggests that around 12,000 follow attempts never ran because of errors (spelled out as arithmetic after the job list below). So I will retry the three jobs from before plus the one Exit below, four in total.
[2] Done python follow_users_w_selenium.py follow2.txt
[3] Done python follow_users_w_selenium.py follow3.txt
[5] Done python follow_users_w_selenium.py follow5.txt
[6] Done python follow_users_w_selenium.py follow6.txt
[7] Exit 1 python follow_users_w_selenium.py follow7.txt
[8] Done python follow_users_w_selenium.py follow8.txt
[9] Done python follow_users_w_selenium.py follow9.txt
[11] Done python follow_users_w_selenium.py follow11.txt
[12] Done python follow_users_w_selenium.py follow12.txt
[13]- Done python follow_users_w_selenium.py follow13.txt
[14]+ Done python follow_users_w_selenium.py follow14.txt
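The sanity check above, spelled out as arithmetic:

# Quick arithmetic behind the error estimate.
existing  = 20_000   # follows before the run
attempted = 69_000   # unique users in followlist1105.txt
actual    = 57_000   # follows after the run

best_case  = existing + attempted   # 89,000 if no target was already followed
worst_case = attempted              # 69,000 even if all 20,000 overlap
print(worst_case - actual)          # ~12,000 attempts likely lost to errors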
$ python follow_users_w_selenium.py follow7.txt &
python follow_users_w_selenium.py follow1.txt &
python follow_users_w_selenium.py follow4.txt &
python follow_users_w_selenium.py follow10.txt &
[1] 3187
[2] 3192
[3] 3197
[4] 3202
With only four processes running in parallel, it moves along much more briskly than when 14 were going.

Follows climbed to 69,828.

Redo runs
In Step 1, Niigata had errored out because Heaven spells it niigata, not nigata. Tottori errored because only one shop is registered there and it has no reviews at all. I'm redoing the Niigata and Shiga portions.
$ python get_referrer_w_requests.py niigata 1 120 &
python get_referrer_w_requests.py shiga 1 120 &
python get_referrer_w_requests.py tottori 1 120 &
[1] 3219
[2] 3224
[3] 3229
(TBD)
Scripts
I turned the hard-coded variables into command-line arguments.
get_referrer_w_requests.py
####
# create user list to follow
# how to use
#
#   python get_referrer_w_requests.py pref start_of_page end_of_page
#
# sample
#   python get_referrer_w_requests.py saitama 1 10
import sys
import requests
from bs4 import BeautifulSoup
import datetime
from urllib.parse import urljoin

args = sys.argv
PREFECTURE = args[1]
START_PAGE_OF_ITEM_LINKS = int(args[2])
END_PAGE_OF_ITEM_LINKS = int(args[3])

BASE_URL = "https://www.cityheaven.net"
BASE_KUCHIKOMI_TOP_URL = f"{BASE_URL}/{PREFECTURE}/reviewlist/like"
suffix_no = f"{START_PAGE_OF_ITEM_LINKS}-{END_PAGE_OF_ITEM_LINKS}"
REVIEW_ITEM_LINKS = f"{PREFECTURE}_review_item_links_{suffix_no}.txt"
REVIEWER_USER_HOME_LINKS = f"{PREFECTURE}_reviewer_user_home_links_{suffix_no}.txt"
UNDUP_REVIEWER_USER_HOME_LINKS = f"{PREFECTURE}_undup_reviewer_user_home_links_{suffix_no}.txt"
REFERRER_USER_HOME_LINKS = f"{PREFECTURE}_referrer_user_home_links_{suffix_no}.txt"
UNDUP_REFERRER_USER_HOME_LINKS = f"{PREFECTURE}_undup_referrer_user_home_links_{suffix_no}.txt"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}

def main():
    # Step 1: Create review item links
    for i in range(START_PAGE_OF_ITEM_LINKS, END_PAGE_OF_ITEM_LINKS + 1):
        url = f"{BASE_KUCHIKOMI_TOP_URL}/{i}/"
        now = datetime.datetime.now()
        print(f"{now} Step 1, To get item links by retrieving on {i}")
        write_review_item_links(url)
    # Step 2: Create reviewer user home links
    with open(REVIEW_ITEM_LINKS, mode="r") as f:
        for line in f:
            now = datetime.datetime.now()
            print(f"{now} Step 2, To get reviewer home by retrieving on {line}")
            write_reviewer_user_home(line.strip())
    # Step 3: Eliminate duplicate lines
    now = datetime.datetime.now()
    print(f"{now} Start Step 3: Eliminate duplicated lines")
    eliminate_duplicated_lines(REVIEWER_USER_HOME_LINKS, UNDUP_REVIEWER_USER_HOME_LINKS)
    # Step 4: Get referrer home links
    with open(UNDUP_REVIEWER_USER_HOME_LINKS, mode="r") as f:
        for line in f:
            referrer_list_top_url = line.strip().replace('review-list', 'refer-user-list')
            now = datetime.datetime.now()
            print(f"{now} Step 4, To get referrer home by retrieving on {referrer_list_top_url}")
            write_referrer_user_home(referrer_list_top_url)
    now = datetime.datetime.now()
    print(f"{now} Step 4 finished")
    # Step 5: Eliminate duplicate lines again
    eliminate_duplicated_lines(REFERRER_USER_HOME_LINKS, UNDUP_REFERRER_USER_HOME_LINKS)
    now = datetime.datetime.now()
    print(f"{now} Step 5 finished")

def write_review_item_links(url):
    # Collect the review links listed on one index page and append them to a file.
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    elements = soup.find_all(class_="item-link")
    with open(REVIEW_ITEM_LINKS, mode='a') as f:
        for element in elements:
            item_link = element.get('href')
            full_item_link = urljoin(BASE_URL, item_link)
            f.write(full_item_link + "\n")

def write_reviewer_user_home(url):
    # Extract the reviewer's user-home link from a single review page.
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    elements = soup.select("div.review-item-head > div.review-item-head-top > div > p > a")
    if elements:
        user_home = elements[0].get('href')
        full_user_home = urljoin(BASE_URL, user_home)
        with open(REVIEWER_USER_HOME_LINKS, mode='a') as f:
            f.write(full_user_home + "\n")

def eliminate_duplicated_lines(input_filename, output_filename):
    with open(input_filename, 'r') as f:
        uniq_lines = set(line.strip() for line in f)
    with open(output_filename, 'w') as f:
        for line in uniq_lines:
            f.write(line + "\n")

def write_referrer_user_home(url):
    # Walk a user's "refer-user-list" pages, following the "next" button,
    # and append every referrer's user-home link to a file.
    while url:
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.find_all(class_="item-link")
        with open(REFERRER_USER_HOME_LINKS, mode='a') as f:
            for element in elements:
                referrer_url = element.get('href')
                full_referrer_url = urljoin(BASE_URL, referrer_url)
                f.write(full_referrer_url + "\n")
        next_button = soup.select_one("div.pager > a.next")
        # if next_button and next_button.text == "次へ":
        if next_button and next_button.get('href'):
            now = datetime.datetime.now()
            url = urljoin(BASE_URL, next_button.get('href'))
            print(f"{now} detected next button. now going to {url}")
        else:
            now = datetime.datetime.now()
            print(f"{now} no next button detected on {url}")
            break

if __name__ == "__main__":
    main()
follow_users_w_selenium.py
#####
# Follow users listed in a text file
# how to use
#
#   python follow_users_w_selenium.py list_to_follow.txt
#
# sample
#   python follow_users_w_selenium.py tokyo_undup_referrer_user_home_links_1-60.txt
#
import sys
import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, WebDriverException
import config  # local module holding username / password

args = sys.argv
USERS_TO_FOLLOW = args[1]
LOGIN_PAGE = 'https://www.cityheaven.net/mypage/login/'

def main():
    options = Options()
    # options.add_argument('--headless')  # headless mode
    options.add_argument('--blink-settings=imagesEnabled=false')  # do not download images
    options.add_argument('--ignore-certificate-errors')  # avoid ssl errors
    options.add_argument('--ignore-ssl-errors')
    options.add_argument('--log-level=1')  # suppress TensorFlow messages
    options.page_load_strategy = 'normal'
    global driver
    driver = webdriver.Chrome(options=options)
    heaven_login(LOGIN_PAGE)  # log in
    now = datetime.datetime.now()
    print(f"start at {now}")
    with open(USERS_TO_FOLLOW) as f:  # open the list of user pages
        for userurl in f:  # one URL per line
            follow_user(userurl)
    now = datetime.datetime.now()
    print(f"finish at {now}")

def heaven_login(url):
    driver.get(url)
    email = driver.find_element(By.XPATH, '//*[@id="user"]')
    email.send_keys(config.username)
    password = driver.find_element(By.XPATH, '//*[@id="pass"]')
    password.send_keys(config.password + '\n')  # newline submits the form

def follow_user(url):
    driver.get(url)
    try:
        # Match the follow button only when the user is not already followed.
        element = driver.find_element(
            By.CSS_SELECTOR, ".button-outline._user.follow-btn:not(.is-follow)")
        driver.execute_script("arguments[0].click();", element)
        print(url.strip('\r\n') + " has been added to your follow members!")
    except NoSuchElementException:
        print(url.strip('\r\n') + " is already among your followed members.")
    except WebDriverException:
        print(url.strip('\r\n') + " network issue occurred")

if __name__ == '__main__':
    main()
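The script imports a local config module for the login credentials; a minimal config.py might look like this (placeholder values; keep the real file out of version control):

# config.py -- credentials read by follow_users_w_selenium.py
username = "your-login-email@example.com"  # placeholder
password = "your-password"                 # placeholder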