让gpt定量统计了nodeseek和hostloc的活跃度

lingjiuys

898days ago

#11

看这个数据，ns的巅峰是去年黑五的时候。

shuai管理交易中介Dev侦探

898days ago

#12

不能让一个平台独占网络，就像支付宝和微信一样，必须要有竞争对手，才能持久

Debian楼主

898days ago

#13

@290 #10

很基础的对话就可以了，我先写了十几行，说还有很多功能我懒得写了，帮我扩展，然后gpt就会按要求继续写，感觉哪里不对就告诉他，他就会说抱歉重新改了下，反复修改就好了

叽叽喳喳

898days ago

#14

xhj007 爬虫：从入门到...

Debian楼主

892days ago edited 892days ago

#15

2024-01-11 更新，原来的代码没什么大的问题，但是横坐标的时间用的是UTC时间，和中国时区差了8个小时。也就是说，统计的某天的主题帖数目是按照UTC时间来的，看个趋势没有太大的问题，但是想要获得某天的主题帖数量精确绝对值就不是很准了，因此稍微修改了一下。

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt
from collections import defaultdict
import matplotlib.dates as mdates
from matplotlib.ticker import MultipleLocator
import pytz
# Time zone used when converting message timestamps to calendar dates.
# (A bare datetime.now(shanghai_tz) call does not change any global setting,
# so the conversion is done explicitly wherever a date is produced.)
shanghai_tz = pytz.timezone('Asia/Shanghai')

import seaborn as sns
sns.set_theme()

# Send both plain-HTTP and HTTPS requests through the same local proxy.
proxies = {
    scheme: 'http://127.0.0.1:1081'
    for scheme in ('http', 'https')
}

def get_message_id(result):
    """Return the message id taken from a message's link.

    The first element inside ``result`` whose ``href`` starts with the channel
    URL prefix is used; the id is the text after the last ``/`` of that href.
    """
    link_selector = '[href^="https://t.me/serveruniverse/"]'
    first_anchor = result.select(link_selector)[0]
    _, _, message_id = first_anchor['href'].rpartition('/')
    return message_id

def get_messages(url, timeout=30):
    """Download a channel preview page and return its message elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned with an exception.

    Returns:
        The elements of the parsed page that match the message-wrapper
        CSS selector.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.select('.tgme_widget_message_wrap.js-widget_message_wrap')

def parse_messages(url, stat=None, max_retries=5):
    """Walk the channel preview pages and collect (date, message id) pairs.

    Starting at ``url``, each page is fetched with ``get_messages``. Every
    message whose markup contains ``#nodeseek`` or ``#hostloc`` and whose id
    has not been recorded yet is appended to ``stat`` together with its date
    converted to the Asia/Shanghai time zone. The next page is requested with
    ``?after=<newest id on the current page>``.

    The walk ends when a page is empty or reports 'No posts found', when a
    page contains no unseen ids, on Ctrl-C, or after too many consecutive
    errors.

    Args:
        url: First page to fetch.
        stat: Optional result of an earlier run, shaped
            ``{'nodeseek': [(date, message_id), ...], 'hostloc': [...]}``.
            It is extended in place. A fresh dict is created when omitted.
        max_retries: Number of consecutive failed retries tolerated before
            giving up and returning what has been collected so far.

    Returns:
        The ``stat`` dict.
    """
    if stat is None:
        stat = {'nodeseek': [], 'hostloc': []}

    # Ids already recorded, so that nothing is counted twice.
    seen_ids = {
        message_id
        for source in ('nodeseek', 'hostloc')
        for _, message_id in stat[source]
    }

    failures = 0
    while True:
        try:
            results = get_messages(url)

            # An empty page has nothing to read; indexing it would only raise.
            if not results or 'No posts found' in str(results[0]):
                break

            page = [(result, get_message_id(result)) for result in results]
            current_message_ids = {message_id for _, message_id in page}

            # Ids are strings. Ordering by (length, text) matches numeric order
            # for plain decimal ids, whereas a bare max() would rank '999'
            # above '1000'.
            newest_id = max(current_message_ids, key=lambda mid: (len(mid), mid))

            # max_id is a module-level value set by the driver script and is
            # only used for this progress line, so a missing value must not
            # abort the walk.
            print('current id', newest_id, '/', str(globals().get('max_id', '?')))

            # Nothing new on this page: we have caught up.
            if current_message_ids.issubset(seen_ids):
                break

            # Parse the whole page first and commit afterwards, so a failure
            # half-way through cannot leave partial entries that a retry
            # would then append a second time.
            page_entries = []
            for result, message_id in page:
                markup = str(result)
                if '#nodeseek' in markup:
                    source = 'nodeseek'
                elif '#hostloc' in markup:
                    source = 'hostloc'
                else:
                    # Message from some other source.
                    continue

                # Skip messages that were already recorded.
                if message_id in seen_ids:
                    continue

                # Timestamp string of the message.
                date_string = result.select('[href^="https://t.me/serveruniverse/"] time')[0]['datetime']

                # Convert to a calendar date in the Asia/Shanghai time zone.
                date = datetime.fromisoformat(date_string).astimezone(shanghai_tz).strftime('%Y-%m-%d')

                page_entries.append((source, date, message_id))

            for source, date, message_id in page_entries:
                stat[source].append((date, message_id))

            # Continue with the page that follows the newest id seen here.
            url = 'https://t.me/s/serveruniverse/?after=' + newest_id
            seen_ids.update(current_message_ids)
            failures = 0
        except KeyboardInterrupt:
            # Let the user stop the walk and keep what was collected.
            break
        except Exception as e:
            print(e)
            failures += 1
            if failures > max_retries:
                print('giving up after', failures, 'consecutive errors')
                break

    return stat



def plot_messages(stat):
    """Draw a line chart of the number of daily posts per source.

    Args:
        stat: ``{'nodeseek': [(date, message_id), ...], 'hostloc': [...]}``
            as returned by ``parse_messages``; ``date`` is a
            ``'%Y-%m-%d'`` string.
    """
    # Count messages per day and source. The counts come from the ``stat``
    # argument so the function plots exactly what it is given.
    daily_counts = defaultdict(lambda: {'nodeseek': 0, 'hostloc': 0})
    for source, messages in stat.items():
        for date, _ in messages:
            daily_counts[date][source] += 1

    today = datetime.now(shanghai_tz).strftime('%Y-%m-%d')

    # Keep days with more than 100 hostloc posts (others are treated as
    # anomalous); today is kept regardless because it is still incomplete.
    dates = [
        date for date in sorted(daily_counts)
        if daily_counts[date]['hostloc'] > 100 or date == today
    ]

    nodeseek_counts = [daily_counts[date]['nodeseek'] for date in dates]
    hostloc_counts = [daily_counts[date]['hostloc'] for date in dates]

    # Line chart with one series per source.
    plt.figure(figsize=(10, 8))
    plt.plot(dates, nodeseek_counts, label='nodeseek', marker='o')
    plt.plot(dates, hostloc_counts, label='hostloc', marker='o')

    plt.xlabel('Date')
    plt.ylabel('Number of Daily Articles')
    plt.title('Statistics of daily articles from hostloc and nodeseek')
    plt.legend()

    # Show about 20 tick labels, picked by stepping backwards from the most
    # recent date so that the last day is always labelled.
    interval = max(1, len(dates) // 20)
    plt.xticks(dates[::-interval][::-1], rotation=45, ha='right')

    plt.tight_layout()
    # Leave the x range automatic and fix the y range to 0..700.
    plt.axis([None, None, 0, 700])

    
# Resume after the last recorded hostloc message when result_stat already
# exists from an earlier run; otherwise start from the default id with no
# previous statistics.
start_id = '1000'
try:
    start_id = result_stat['hostloc'][-1][1]
except (NameError, TypeError, KeyError, IndexError) as e:
    # result_stat is undefined, None, or holds no hostloc entries yet.
    result_stat = None
    print(e)

url = 'https://t.me/s/serveruniverse'
# Ids are strings: ordering by (length, text) matches numeric order for plain
# decimal ids, whereas a bare max() would rank '999' above '1000'.
max_id = max(
    (get_message_id(result) for result in get_messages(url)),
    key=lambda mid: (len(mid), mid),
)
result_stat = parse_messages(url + '/?after=' + start_id, result_stat)
# Draw the line chart.
plot_messages(result_stat)

让gpt定量统计了nodeseek和hostloc的活跃度

你好啊，陌生人!

快捷功能区

📈用户数目📈

🎉欢迎新用户🎉

所有版块

让gpt定量统计了nodeseek和hostloc的活跃度

你好啊，陌生人!

快捷功能区

所有版块

📈用户数目📈

🎉欢迎新用户🎉