logo NodeSeekbeta

让gpt定量统计了nodeseek和hostloc的活跃度

12
  • 看这个数据,ns的巅峰是去年黑五的时候。

  • 不能让一个平台独占网络,就像支付宝和微信一样,必须要有竞争对手,才能持久

  • @290 #10

    很基础的对话就可以了,我先写了十几行,说还有很多功能我懒得写了,帮我扩展,然后gpt就会按要求继续写,感觉哪里不对就告诉他,他就会说抱歉重新改了下,反复修改就好了

  • xhj007 爬虫:从入门到...

  • 2024-01-11 更新:原来的代码没什么大的问题,但是横坐标的时间用的是UTC时间,和中国时区差了8个小时。也就是说,统计的某天的主题帖数目是按照UTC时间来的,看个趋势没有太大的问题,但是想要获得某天的主题帖数量精确绝对值就不是很准了。因此稍微修改了一下,改为先把时间戳转换到中国时区(Asia/Shanghai)再按天统计。

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import matplotlib.pyplot as plt
    from collections import defaultdict
    import matplotlib.dates as mdates
    from matplotlib.ticker import MultipleLocator
    import pytz
    # Timezone used to bucket message timestamps by calendar day.
    # NOTE: this only builds a tzinfo object; it does not change any global
    # timezone setting, so it must be passed explicitly wherever a datetime
    # is converted (see the astimezone(shanghai_tz) calls below).
    shanghai_tz = pytz.timezone('Asia/Shanghai')
    
    import seaborn as sns
    # Apply seaborn's default theme to every matplotlib figure drawn afterwards.
    sns.set_theme()
    
    # Proxy settings handed to requests: both HTTP and HTTPS traffic go through
    # a proxy listening on localhost port 1081.
    proxies = {
        'http': 'http://127.0.0.1:1081',
        'https': 'http://127.0.0.1:1081',
    }
    
    def get_message_id(result):
        """Return the id of a message, taken from its permalink.

        ``result`` must expose ``select``; the first anchor whose ``href``
        starts with the channel URL is used, and the text after the last
        ``/`` of that ``href`` is returned as a string.

        Raises:
            IndexError: if ``result`` contains no such anchor.
        """
        anchors = result.select('[href^="https://t.me/serveruniverse/"]')
        href = anchors[0]['href']
        # Everything after the final slash is the id.
        return href.rsplit('/', 1)[-1]
    
    def get_messages(url, timeout=30):
        """Download ``url`` and return the message elements found on the page.

        The request goes through the module-level ``proxies`` mapping.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, requests would block indefinitely on a stalled
                proxy or connection.

        Returns:
            The elements matching the
            ``.tgme_widget_message_wrap.js-widget_message_wrap`` CSS selector,
            in document order (empty if the page has none).

        Raises:
            requests.exceptions.RequestException: on connection errors or
                when ``timeout`` elapses.
        """
        response = requests.get(url, proxies=proxies, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        return soup.select('.tgme_widget_message_wrap.js-widget_message_wrap')
    
    def parse_messages(url, stat=None, max_retries=5):
        """Page forward through the channel from ``url`` and collect posts.

        Each page is fetched with ``get_messages``. Messages whose HTML
        contains ``#nodeseek`` or ``#hostloc`` are recorded as
        ``(date, message_id)`` tuples, where ``date`` is the message timestamp
        converted to ``shanghai_tz`` and formatted as ``YYYY-MM-DD``. Paging
        continues with ``?after=<largest id on the page>`` until a page is
        empty, reports 'No posts found', or contains only already-seen ids.

        Progress is printed using the module-level ``max_id``, which must be
        defined before this function is called.

        Args:
            url: First page to fetch.
            stat: Optional result of a previous call. It is extended in place,
                and the ids it already holds are skipped.
            max_retries: Number of consecutive failed attempts that are
                retried before giving up and returning what was collected so
                far. ``None`` retries forever.

        Returns:
            A dict with keys ``'nodeseek'`` and ``'hostloc'``, each mapping to
            a list of ``(date, message_id)`` tuples.
        """
        if stat is None:
            stat = {'nodeseek': [], 'hostloc': []}
            prev_message_ids = set()  # ids seen on earlier pages
        else:
            prev_message_ids = {
                message_id
                for source in ('nodeseek', 'hostloc')
                for (_, message_id) in stat[source]
            }

        failures = 0  # consecutive failed attempts on the current page
        while True:
            try:
                results = get_messages(url)

                # An empty page has nothing to index; treat it like the
                # explicit end-of-feed marker instead of failing on results[0].
                if not results or 'No posts found' in str(results[0]):
                    break

                current_message_ids = {get_message_id(result) for result in results}
                # Ids are numeric strings: compare them as integers, otherwise
                # '9999' would sort above '10000' and break the paging cursor.
                newest_id = max(current_message_ids, key=int)
                print('current id', str(newest_id), '/', str(max_id))

                # Stop once a page brings nothing that has not been seen.
                if current_message_ids.issubset(prev_message_ids):
                    break

                # Parse the whole page into a local buffer first, so that a
                # failure half-way through does not leave partial rows in
                # ``stat`` that the retry would then append a second time.
                page_entries = []
                for result in results:
                    html = str(result)
                    if '#nodeseek' in html:
                        source = 'nodeseek'
                    elif '#hostloc' in html:
                        source = 'hostloc'
                    else:
                        # Message from some other source.
                        continue

                    message_id = get_message_id(result)

                    # Skip messages that were already counted.
                    if message_id in prev_message_ids:
                        continue

                    date_string = result.select('[href^="https://t.me/serveruniverse/"] time')[0]['datetime']

                    # Bucket by calendar day in the Shanghai timezone.
                    date = datetime.fromisoformat(date_string).astimezone(shanghai_tz).strftime('%Y-%m-%d')
                    page_entries.append((source, (date, message_id)))

                for source, entry in page_entries:
                    stat[source].append(entry)

                # Advance to the page after the newest message just seen.
                url = 'https://t.me/s/serveruniverse/?after=' + newest_id
                prev_message_ids.update(current_message_ids)
                failures = 0
            except KeyboardInterrupt:
                # Let the user stop the crawl and keep what was collected.
                break
            except Exception as e:
                print(e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    print('giving up after', failures, 'consecutive failures')
                    break

        return stat
    
    
    
    def plot_messages(stat):
        """Plot the number of posts per day for nodeseek and hostloc.

        Args:
            stat: Dict mapping ``'nodeseek'`` / ``'hostloc'`` to lists of
                ``(date, message_id)`` tuples, with ``date`` formatted as
                ``YYYY-MM-DD``.

        Days with 100 or fewer hostloc posts are dropped as outliers, except
        for the current day (in ``shanghai_tz``), which is always kept. The
        figure is drawn on the current matplotlib state; nothing is returned.
        """
        # Count messages per day and per source. Use the ``stat`` argument,
        # not a module-level variable, so the function plots what it is given.
        daily_counts = defaultdict(lambda: {'nodeseek': 0, 'hostloc': 0})
        for source, messages in stat.items():
            for date, _ in messages:
                daily_counts[date][source] += 1

        today = datetime.now(shanghai_tz).strftime('%Y-%m-%d')

        # Filter out days with abnormal values; today is kept even if low.
        dates = [
            date for date in sorted(daily_counts)
            if daily_counts[date]['hostloc'] > 100 or date == today
        ]

        nodeseek_counts = [daily_counts[date]['nodeseek'] for date in dates]
        hostloc_counts = [daily_counts[date]['hostloc'] for date in dates]

        plt.figure(figsize=(10, 8))
        plt.plot(dates, nodeseek_counts, label='nodeseek', marker='o')
        plt.plot(dates, hostloc_counts, label='hostloc', marker='o')

        plt.xlabel('Date')
        plt.ylabel('Number of Daily Articles')
        plt.title('Statistics of daily articles from hostloc and nodeseek')
        plt.legend()

        # Label roughly 20 dates. Stepping backwards from the end guarantees
        # that the most recent date always gets a tick.
        interval = max(1, len(dates) // 20)
        plt.xticks(dates[::-interval][::-1], rotation=45, ha='right')

        plt.tight_layout()
        plt.axis([None, None, 0, 700])
    
        
    # Default starting point when there is no earlier result to resume from.
    start_id = '1000'
    try:
        # Resume after the last hostloc message kept from a previous run in
        # the same session (``result_stat`` may not exist yet).
        start_id = result_stat['hostloc'][-1][1]
    except (NameError, TypeError, KeyError, IndexError) as e:
        # No usable previous result: start from scratch.
        result_stat = None
        print(e)
    
    url = 'https://t.me/s/serveruniverse'
    # Newest id on the channel's front page, used for progress output.
    # Ids are numeric strings, so compare them as integers rather than
    # lexicographically ('9999' would otherwise beat '10000').
    max_id = max((get_message_id(result) for result in get_messages(url)), key=int)
    result_stat = parse_messages(url + '/?after=' + start_id, result_stat)
    # Draw the line chart.
    plot_messages(result_stat)
    
12

你好啊,陌生人!

我的朋友,看起来你是新来的,如果想参与到讨论中,点击下面的按钮!

📈用户数目📈

目前论坛共有61855位seeker

🎉欢迎新用户🎉