让gpt定量统计了nodeseek和hostloc的活跃度

Debian楼主

899days ago edited 895days ago in 日常

作为机界的忠实用户，经常在上面看看各大论坛有没有什么优惠活动

昨天突发奇想，能不能把频道的消息都爬下来，统计每个帖子里#hostloc和#nodeseek的出现时间，做一个人气变化图

说干就干，从网上找找爬虫教程😎，搞了会发现太麻烦了😭。想到还有gpt，于是就试了下，没想到效果很好 ~~，孩子很爱吃，敏感肌也能用~~

先放结果

Statistics of daily articles from hostloc and nodeseek，imgur网站需要科学

图片在imgur，部分地区需要科学才能看
尝试分析

从每日主题数目来看，当前ns和loc的活跃度差不多
loc在开放注册那天，突然增加了很多主题帖，应该是新用户加入的原因
不知道什么原因，loc从开放注册后，用户活跃度持续走低
ns日活曾在07-28左右开始小幅下滑，持续了一个月左右
由于今天还没结束，所以曲线末尾有个异常数据点

再放代码

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt
from collections import defaultdict
import matplotlib.dates as mdates
from matplotlib.ticker import MultipleLocator
import seaborn as sns
sns.set_theme()

# Proxy mapping handed to requests: plain-HTTP and HTTPS traffic both go
# through the same local proxy endpoint.
proxies = dict.fromkeys(('http', 'https'), 'http://127.0.0.1:1081')

def get_message_id(result):
    """Return the message id found in a message element's channel link.

    The first element that ``result.select`` matches for an ``href`` starting
    with ``https://t.me/serveruniverse/`` is used; the id is the text after
    the last ``/`` of that href and is returned as a string.
    """
    anchors = result.select('[href^="https://t.me/serveruniverse/"]')
    href = anchors[0]['href']
    # Everything after the final slash is the id part of the link.
    _, _, message_id = href.rpartition('/')
    return message_id

def get_messages(url, timeout=30):
    """Download a channel preview page and return its message wrapper elements.

    Args:
        url: Address of the page to fetch. The request is sent through the
            module-level ``proxies`` mapping.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The list of elements matching
        ``.tgme_widget_message_wrap.js-widget_message_wrap`` in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, proxies=proxies, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # a (message-less) channel page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.select('.tgme_widget_message_wrap.js-widget_message_wrap')

def parse_messages(url, stat = None, max_retries = 10, retry_delay = 2.0):
    """Walk the channel preview pages starting at ``url`` and record tagged posts.

    Each page is fetched with ``get_messages``. Every message whose markup
    contains ``#nodeseek`` or ``#hostloc`` is recorded as a
    ``(date, message_id)`` tuple under the matching key, where ``date`` is the
    ``YYYY-MM-DD`` form of the message's ``datetime`` attribute (in whatever
    UTC offset that attribute carries). The next page is requested with
    ``?after=<numerically largest id on the current page>``.

    Args:
        url: URL of the first page to fetch.
        stat: Result of an earlier call to extend, or ``None`` to start from
            scratch. Ids already present in it are never recorded twice.
        max_retries: Number of consecutive failed attempts on the same page
            after which the walk stops and the data collected so far is
            returned. ``None`` retries without limit.
        retry_delay: Seconds to sleep before retrying a failed page.

    Returns:
        The ``stat`` dict ``{'nodeseek': [...], 'hostloc': [...]}``.

    Note:
        The progress line reads the module-level name ``max_id``.
    """
    import time  # function-scope import: only the retry back-off needs it

    def id_order(message_id):
        # Ids are strings; order them numerically so that '1000' > '999'.
        # Anything non-numeric sorts lowest instead of raising.
        return int(message_id) if message_id.isdigit() else -1

    if stat is None:
        # Fresh run: one list of (date, message_id) tuples per source.
        stat = {'nodeseek': [], 'hostloc': []}

    # Ids that are already recorded (when resuming) or were seen on an
    # earlier page of this run.
    seen_ids = {message_id for _, message_id in stat['nodeseek']}
    seen_ids |= {message_id for _, message_id in stat['hostloc']}

    consecutive_failures = 0
    while True:
        try:
            results = get_messages(url)

            # An empty page or the explicit placeholder means there is
            # nothing (more) to read; retrying the same URL cannot help.
            if not results or 'No posts found' in str(results[0]):
                break

            current_message_ids = {get_message_id(result) for result in results}
            newest_id = max(current_message_ids, key=id_order)
            print('current id', newest_id, '/', str(max_id))
            # Stop once a page brings nothing that has not been seen before.
            if current_message_ids.issubset(seen_ids):
                break

            # Collect the page's entries first and commit them only after the
            # whole page parsed cleanly, so a retry after a mid-page failure
            # cannot record the same message twice.
            page_entries = []
            for result in results:
                markup = str(result)
                if '#nodeseek' in markup:
                    source = 'nodeseek'
                elif '#hostloc' in markup:
                    source = 'hostloc'
                else:
                    # Post from some other source: not counted.
                    continue

                message_id = get_message_id(result)
                # Skip anything that is already recorded.
                if message_id in seen_ids:
                    continue

                date_string = result.select('[href^="https://t.me/serveruniverse/"] time')[0]['datetime']
                date = datetime.fromisoformat(date_string).strftime('%Y-%m-%d')
                page_entries.append((source, date, message_id))

            for source, date, message_id in page_entries:
                stat[source].append((date, message_id))
            seen_ids.update(current_message_ids)

            # Continue with the page that follows the newest id seen here.
            url = 'https://t.me/s/serveruniverse/?after=' + newest_id
            consecutive_failures = 0
        except KeyboardInterrupt:
            # Manual stop: keep whatever has been collected so far.
            break
        except Exception as e:
            print(e)
            consecutive_failures += 1
            if max_retries is not None and consecutive_failures >= max_retries:
                print('giving up after', consecutive_failures, 'consecutive failures')
                break
            time.sleep(retry_delay)

    return stat

def plot_messages(stat):
    """Draw daily post counts for nodeseek and hostloc as two line series.

    ``stat`` maps a source name to a list of ``(date, message_id)`` tuples.
    Posts are counted per date string. The nodeseek series uses every date,
    while the hostloc series only keeps dates with more than 100 posts.
    """
    # Tally posts per day and per source.
    per_day = defaultdict(lambda: {'nodeseek': 0, 'hostloc': 0})
    for origin in stat:
        for day, _message_id in stat[origin]:
            per_day[day][origin] += 1

    # Date strings sort chronologically in YYYY-MM-DD form.
    dates = sorted(per_day)
    nodeseek_dates = dates
    nodeseek_counts = [per_day[day]['nodeseek'] for day in dates]

    # Drop hostloc days with implausibly low counts.
    hostloc_points = [
        (day, per_day[day]['hostloc'])
        for day in dates
        if per_day[day]['hostloc'] > 100
    ]
    hostloc_dates = [day for day, _count in hostloc_points]
    hostloc_counts = [count for _day, count in hostloc_points]

    # Line chart with one series per source.
    plt.figure(figsize=(10, 8))
    for xs, ys, series_name in (
        (nodeseek_dates, nodeseek_counts, 'nodeseek'),
        (hostloc_dates, hostloc_counts, 'hostloc'),
    ):
        plt.plot(xs, ys, label=series_name, marker='o')

    plt.xlabel('Date')
    plt.ylabel('Number of Daily Articles')
    plt.title('Statistics of daily articles from hostloc and nodeseek')
    plt.legend()

    # Aim for roughly 20 tick labels, stepping backwards from the latest date
    # so that the most recent day is always labelled.
    interval = max(1, len(dates) // 20)
    tick_dates = list(reversed(dates[::-interval]))
    plt.xticks(tick_dates, rotation=45, ha='right')

    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.axis([None, None, 0, 700])

# Default resume point, used when no earlier result is available.
start_id = '1000'
try:
    # `result_stat` only exists if this script already ran in the current
    # session (presumably an interactive/notebook workflow - confirm); in
    # that case continue from the last recorded hostloc message id.
    start_id = result_stat['hostloc'][-1][1]
except (NameError, TypeError, KeyError, IndexError) as e:
    # No usable earlier result: start from scratch.
    result_stat = None
    print(e)

url = 'https://t.me/s/serveruniverse'
# Newest id on the channel's front page, shown in the progress output.
# Ids are strings, so compare them numerically ('1000' > '999').
max_id = max(
    (get_message_id(result) for result in get_messages(url)),
    key=lambda message_id: int(message_id) if message_id.isdigit() else -1,
)
result_stat = parse_messages(url + '/?after=' + start_id, result_stat)
# Draw the line chart.
plot_messages(result_stat)

最后，不得不感叹，gpt真的yyds
~~如果代码有错，它背锅~~

Debian楼主

892days ago edited 892days ago

#15

2024-01-11 更新，原来的代码没什么大的问题，但是横坐标的时间用的是UTC时间，和中国时区差了8个小时。也就是说，统计的某天的主题帖数目是按照UTC时间来的，看个趋势没有太大的问题，但是想要获得某天的主题帖数量精确绝对值就不是很准了，因此稍微修改了一下。

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt
from collections import defaultdict
import matplotlib.dates as mdates
from matplotlib.ticker import MultipleLocator
import pytz
# Time zone used to convert message timestamps to China-local calendar days.
# Note: this only creates a tzinfo object; it does not change any
# process-wide time zone setting.
shanghai_tz = pytz.timezone('Asia/Shanghai')

import seaborn as sns
sns.set_theme()

# Proxy mapping handed to requests: the same local proxy endpoint serves
# both the 'http' and the 'https' scheme.
proxies = {scheme: 'http://127.0.0.1:1081' for scheme in ('http', 'https')}

def get_message_id(result):
    """Extract the message id from a message element's channel link.

    Uses the first element that ``result.select`` returns for an ``href``
    beginning with ``https://t.me/serveruniverse/`` and returns, as a string,
    the part of that href after its last ``/``.
    """
    first_link = result.select('[href^="https://t.me/serveruniverse/"]')[0]
    return first_link['href'].rsplit('/', 1)[-1]

def get_messages(url, timeout=30):
    """Download a channel preview page and return its message wrapper elements.

    Args:
        url: Address of the page to fetch. The request is sent through the
            module-level ``proxies`` mapping.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The list of elements matching
        ``.tgme_widget_message_wrap.js-widget_message_wrap`` in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, proxies=proxies, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # a (message-less) channel page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.select('.tgme_widget_message_wrap.js-widget_message_wrap')

def parse_messages(url, stat = None, max_retries = 10, retry_delay = 2.0):
    """Walk the channel preview pages starting at ``url`` and record tagged posts.

    Each page is fetched with ``get_messages``. Every message whose markup
    contains ``#nodeseek`` or ``#hostloc`` is recorded as a
    ``(date, message_id)`` tuple under the matching key, where ``date`` is the
    message's ``datetime`` attribute converted to ``shanghai_tz`` and
    formatted as ``YYYY-MM-DD``. The next page is requested with
    ``?after=<numerically largest id on the current page>``.

    Args:
        url: URL of the first page to fetch.
        stat: Result of an earlier call to extend, or ``None`` to start from
            scratch. Ids already present in it are never recorded twice.
        max_retries: Number of consecutive failed attempts on the same page
            after which the walk stops and the data collected so far is
            returned. ``None`` retries without limit.
        retry_delay: Seconds to sleep before retrying a failed page.

    Returns:
        The ``stat`` dict ``{'nodeseek': [...], 'hostloc': [...]}``.

    Note:
        The progress line reads the module-level name ``max_id``.
    """
    import time  # function-scope import: only the retry back-off needs it

    def id_order(message_id):
        # Ids are strings; order them numerically so that '1000' > '999'.
        # Anything non-numeric sorts lowest instead of raising.
        return int(message_id) if message_id.isdigit() else -1

    if stat is None:
        # Fresh run: one list of (date, message_id) tuples per source.
        stat = {'nodeseek': [], 'hostloc': []}

    # Ids that are already recorded (when resuming) or were seen on an
    # earlier page of this run.
    seen_ids = {message_id for _, message_id in stat['nodeseek']}
    seen_ids |= {message_id for _, message_id in stat['hostloc']}

    consecutive_failures = 0
    while True:
        try:
            results = get_messages(url)

            # An empty page or the explicit placeholder means there is
            # nothing (more) to read; retrying the same URL cannot help.
            if not results or 'No posts found' in str(results[0]):
                break

            current_message_ids = {get_message_id(result) for result in results}
            newest_id = max(current_message_ids, key=id_order)
            print('current id', newest_id, '/', str(max_id))
            # Stop once a page brings nothing that has not been seen before.
            if current_message_ids.issubset(seen_ids):
                break

            # Collect the page's entries first and commit them only after the
            # whole page parsed cleanly, so a retry after a mid-page failure
            # cannot record the same message twice.
            page_entries = []
            for result in results:
                markup = str(result)
                if '#nodeseek' in markup:
                    source = 'nodeseek'
                elif '#hostloc' in markup:
                    source = 'hostloc'
                else:
                    # Post from some other source: not counted.
                    continue

                message_id = get_message_id(result)
                # Skip anything that is already recorded.
                if message_id in seen_ids:
                    continue

                date_string = result.select('[href^="https://t.me/serveruniverse/"] time')[0]['datetime']
                # Bucket by the calendar day in the Shanghai time zone.
                date = datetime.fromisoformat(date_string).astimezone(shanghai_tz).strftime('%Y-%m-%d')
                page_entries.append((source, date, message_id))

            for source, date, message_id in page_entries:
                stat[source].append((date, message_id))
            seen_ids.update(current_message_ids)

            # Continue with the page that follows the newest id seen here.
            url = 'https://t.me/s/serveruniverse/?after=' + newest_id
            consecutive_failures = 0
        except KeyboardInterrupt:
            # Manual stop: keep whatever has been collected so far.
            break
        except Exception as e:
            print(e)
            consecutive_failures += 1
            if max_retries is not None and consecutive_failures >= max_retries:
                print('giving up after', consecutive_failures, 'consecutive failures')
                break
            time.sleep(retry_delay)

    return stat



def plot_messages(stat):
    """Draw daily post counts for nodeseek and hostloc as two line series.

    Args:
        stat: Mapping of source name (``'nodeseek'`` / ``'hostloc'``) to a
            list of ``(date, message_id)`` tuples, where ``date`` is a
            ``YYYY-MM-DD`` string.

    Only dates on which hostloc has more than 100 posts are plotted, plus the
    current day in ``shanghai_tz`` (which is kept even when still incomplete).
    """
    # Tally posts per day and per source. Use the `stat` argument rather than
    # any module-level result, so the function plots what it was given.
    daily_counts = defaultdict(lambda: {'nodeseek': 0, 'hostloc': 0})
    for source, messages in stat.items():
        for date, _ in messages:
            daily_counts[date][source] += 1

    today = datetime.now(shanghai_tz).strftime('%Y-%m-%d')

    # Drop days with implausibly low hostloc counts, except for today.
    dates = [
        date
        for date in sorted(daily_counts)
        if daily_counts[date]['hostloc'] > 100 or date == today
    ]

    nodeseek_counts = [daily_counts[date]['nodeseek'] for date in dates]
    hostloc_counts = [daily_counts[date]['hostloc'] for date in dates]

    # Line chart with one series per source.
    plt.figure(figsize=(10, 8))
    plt.plot(dates, nodeseek_counts, label='nodeseek', marker='o')
    plt.plot(dates, hostloc_counts, label='hostloc', marker='o')

    plt.xlabel('Date')
    plt.ylabel('Number of Daily Articles')
    plt.title('Statistics of daily articles from hostloc and nodeseek')
    plt.legend()

    # Aim for roughly 20 tick labels, stepping backwards from the latest date
    # so that the most recent day is always labelled.
    interval = max(1, len(dates) // 20)
    plt.xticks(dates[::-interval][::-1], rotation=45, ha='right')

    plt.tight_layout()
    plt.axis([None, None, 0, 700])

    
# Default resume point, used when no earlier result is available.
start_id = '1000'
try:
    # `result_stat` only exists if this script already ran in the current
    # session (presumably an interactive/notebook workflow - confirm); in
    # that case continue from the last recorded hostloc message id.
    start_id = result_stat['hostloc'][-1][1]
except (NameError, TypeError, KeyError, IndexError) as e:
    # No usable earlier result: start from scratch.
    result_stat = None
    print(e)

url = 'https://t.me/s/serveruniverse'
# Newest id on the channel's front page, shown in the progress output.
# Ids are strings, so compare them numerically ('1000' > '999').
max_id = max(
    (get_message_id(result) for result in get_messages(url)),
    key=lambda message_id: int(message_id) if message_id.isdigit() else -1,
)
result_stat = parse_messages(url + '/?after=' + start_id, result_stat)
# Draw the line chart.
plot_messages(result_stat)

tujixiuzayu

899days ago

#1

我比较喜欢ns这个界面
Pepsi管理

899days ago

#2

还是喜欢ns
Dogelee2

899days ago

#3

之前瞎分析的
https://www.nodeseek.com/post-36334-1
香农

899days ago edited 899days ago

#4
Debian楼主

899days ago

#5

@Dogelee2 #3
原来有现成的工具呀，学到了学到了
SCP

899days ago

#6

@香农 #4

啊这，怎么还有人活在自己的夜郎自大的梦里，看图上的趋势，国内最大主机论坛都快要易主了
SCP

899days ago

#7

@香农 #4
呃，回复后刷新发现层主编辑了评论，建议编辑回去
4K-Ray管理

899days ago

#8

还是喜欢ns的ui
tina99

898days ago

#9

ns 比较好看
290

898days ago

#10

怎么让gpt搞出来的，有相关教程吗想学习下

让gpt定量统计了nodeseek和hostloc的活跃度

先放结果

再放代码

你好啊，陌生人!

快捷功能区

📈用户数目📈

🎉欢迎新用户🎉

所有版块

让gpt定量统计了nodeseek和hostloc的活跃度

先放结果

再放代码

你好啊，陌生人!

快捷功能区

所有版块

📈用户数目📈

🎉欢迎新用户🎉