-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
executable file
·245 lines (200 loc) · 8.62 KB
/
Copy pathapp.py
File metadata and controls
executable file
·245 lines (200 loc) · 8.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env python3
from datetime import datetime, timedelta
from modules.scraper import scrape
from modules.summarizer import summarize
from modules.tagging import tags_list
import os
import requests
import sys
import json
import argparse
from dotenv import load_dotenv, dotenv_values
# Load variables via python-dotenv before reading the environment below.
load_dotenv()

# Pocket settings read from the environment; each one is None when unset.
POCKET_APP_NAME = os.getenv("POCKET_APP_NAME")
consumer_key = os.getenv("POCKET_CONSUMER_KEY")
access_token = os.getenv("POCKET_ACCESS_TOKEN")
def last_week_timestamp() -> int:
    """
    Return the UNIX timestamp (whole seconds) of the moment one week before now.

    NOTE: this no longer works as a filter, since Pocket's API does not
    respond to relative timestamps (it ignores the "since" parameter).
    """
    week_ago = datetime.now() - timedelta(weeks=1)
    # timestamp() yields a float; int() truncates it to whole seconds.
    return int(week_ago.timestamp())
def format_timestamps(UNIX_TIMESTAMP) -> str:
    """
    Render a UNIX timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    Used to make the timestamps found in Pocket API results readable.
    The argument is passed through int(), so numeric strings are accepted.
    The conversion uses datetime.fromtimestamp() without a timezone argument.
    """
    moment = datetime.fromtimestamp(int(UNIX_TIMESTAMP))
    return f"{moment:%Y-%m-%d %H:%M:%S}"
def get_last_posts(one_week_ago_timestamp, consumer_key, access_token, results_counter=100):
    """
    Fetch saved items from Pocket's "get" endpoint.

    Args:
        one_week_ago_timestamp: Kept for backward compatibility only; it is
            not included in the request payload.
        consumer_key: Pocket consumer key, sent as "consumer_key".
        access_token: Pocket access token, sent as "access_token".
        results_counter: Maximum number of items to ask for (sent as "count").

    Returns:
        The requests.Response object of a successful (HTTP 200) call. This is
        the response itself, not its text; callers decode it with .json().

    The program exits with status 1 when the request cannot be completed or
    when Pocket answers with a status other than 200.
    """
    url = "https://getpocket.com/v3/get"
    headers = {"Content-Type": "application/json"}
    payload = {
        "consumer_key": consumer_key,
        "access_token": access_token,
        "count": str(results_counter),
        "detailType": "complete",
    }

    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection.
        response = requests.post(url, json=payload, headers=headers, timeout=30)
    except requests.RequestException as e:
        # Network-level failures get the same "report and exit" treatment as
        # HTTP errors instead of an unhandled traceback.
        print(f"[X] ERROR!: {e}")
        sys.exit(1)

    if response.status_code != 200:
        print(f"[X] ERROR!: {response.status_code}")
        sys.exit(1)

    return response
def _print_article_details(value: dict, number: int) -> None:
    """Print the metadata of one Pocket item so the user can decide what to do with it."""
    has_video = int(value["has_video"])
    if has_video == 1:
        video = "Includes Video"
    elif has_video == 2:
        video = "Is a Video"
    else:
        video = "No"

    print(f"****** Article #{number} ******")
    # Time when the article was added to Pocket.
    print(f" - Time Added: {format_timestamps(value['time_added'])}")
    print(f" - ID: {value['item_id']}")
    # Not very useful; prone to errors.
    print(f" - Is Article: {'yes' if int(value['is_article']) == 1 else 'no'}")
    print(f" - Word Count: {value['word_count']}")
    print(f" - Language: {value['lang']}")
    print(f" - Title: {value['resolved_title']}")
    print(f" - Description (Excerpt): {value['excerpt'] if value['excerpt'] else 'N/A'}")
    print(f" - URL: {value['resolved_url']}")
    print(f" - Video: {video}")


def _ask_user_selection(message: str) -> str:
    """Prompt until the answer is one of y / n / f / a (any case); return it lowercased."""
    valid_answers = ("y", "n", "f", "a")
    selection = input(message).strip().lower()
    while selection not in valid_answers:
        selection = input(message).strip().lower()
    return selection


def retrieve_articles(res: dict[str, dict], tag: str, results_counter: int):
    """
    Walk the items returned by Pocket, keep those carrying `tag`, and
    optionally scrape them.

    Items that are not dicts, have no "tags" entry, do not carry `tag`, or
    whose "status" is 1 (archived) or 2 (deleted) are skipped. For every
    remaining item its word count is added to the running total and its
    "item_id" is appended to the module-level `article_id_list`.

    When the module-level `args.process` is set, the user is asked per article
    whether to scrape it:
        y = scrape this article
        n = skip this article
        f = skip this article and do not scrape any further ones
        a = scrape this article and all following ones without asking again

    Args:
        res: Mapping of item id -> item dict (the "list" value of the reply).
        tag: Tag an item must carry to be considered.
        results_counter: Maximum number of matching items to consider.

    Returns:
        A tuple (wordcount, scraped_articles): the summed word count of all
        considered items and the list of scraped article texts.
    """
    CONTINUE_MESSAGE = (
        "\n[?] Would you like to summarize this article?\n"
        "- [y]es/ [n]o/ [f]inish and skip this one / Stop Asking and process [A]ll\n"
        ">> "
    )

    # 0 = ask for every article
    # 1 = user finished: do not scrape anything else
    # 2 = process the rest, don't ask again
    continue_flag = 0
    scraped_articles = []
    wordcount = 0
    counter = 0

    for value in res.values():
        if counter >= results_counter:
            break
        if not (isinstance(value, dict) and "tags" in value):
            continue
        if tag not in value["tags"]:
            continue
        # "1" = archived, "2" = deleted. Compare as strings: the status may
        # arrive as either str or int, and str-vs-int equality is always False.
        if str(value["status"]) in ("1", "2"):
            continue

        counter += 1
        wordcount += int(value["word_count"])
        article_id_list.append(value["item_id"])

        # Article details are only shown while the user is still being asked.
        if continue_flag == 0:
            _print_article_details(value, counter)

        if not args.process or continue_flag == 1:
            continue

        if continue_flag == 0:
            selection = _ask_user_selection(CONTINUE_MESSAGE)
            if selection == "a":
                selection = "y"
                continue_flag = 2
        else:
            # continue_flag == 2: the user already chose to process everything.
            selection = "y"

        if selection == "f":
            continue_flag = 1
        elif selection == "y":
            try:
                date, raw_article, title, authors = scrape(value["resolved_url"])
            except Exception as e:
                # Best effort: report the failure (in red) and move on.
                print(f"\033[91m[X] ERROR Scraping this URL: {e}\033[0m")
            else:
                scraped_articles.append(
                    f"Title: {title}\n"
                    f"Authors: {authors}\n"
                    f"Date: {date}\n"
                    f"Word Count: {value['word_count']}\n"
                    f"URL: {value['resolved_url']}\n"
                    f"Content: {raw_article}"
                )

    return wordcount, scraped_articles
def argument_parser(tags_list: list[str]) -> tuple[argparse.Namespace, str, int]:
    """
    Parse the command line of the Pocket wrapper.

    Behaviour:
        * no arguments at all: print the help text and exit with status 0;
        * --list / -l: print every entry of `tags_list` and exit with status 0;
        * otherwise --tag / -t is required; without it argparse reports a
          usage error and exits with status 2.

    Args:
        tags_list: Tags printed by --list.

    Returns:
        A tuple (args, tag, results_counter): the parsed namespace, the chosen
        tag and the number of articles to retrieve (100 when --number is 0 or
        not given).
    """
    parser = argparse.ArgumentParser(description='Pocket Wrapper App Help:')
    parser.add_argument('--number', '-n', type=int, default=100, help='Number of articles to retrieve')
    parser.add_argument('--list', '-l', nargs='?', const=True, default=None, help='List available tags')
    parser.add_argument('--process', '-p', nargs='?', const=True, default=None, help='Complete the processing by LLM model (Default: False)')
    parser.add_argument('--tag', '-t', help='Specify a tag')

    args = parser.parse_args()

    # If no arguments are passed, print help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    if not args.number:
        results_counter = 100
        print("No limit specified. Defaulting to 100.")
    else:
        results_counter = args.number

    if args.list:
        for tag in tags_list:
            print(tag)
        sys.exit(0)

    if not args.tag:
        # Previously this path fell through to the return statement with `tag`
        # unbound and crashed with UnboundLocalError.
        parser.error("a tag is required: use --tag/-t (or --list/-l to see the available tags)")

    return args, args.tag, results_counter
# Script body: parse the command line, fetch the saved items from Pocket,
# filter them by tag and, when --process was given, summarize them.
# NOTE: `args` and `article_id_list` are module-level names that
# retrieve_articles() reads and appends to, so both must exist before it runs.
args, tag, results_counter = argument_parser(tags_list)
# Passed along for compatibility; get_last_posts() does not send it to Pocket.
relative_time = last_week_timestamp()
raw = get_last_posts(relative_time, consumer_key, access_token, results_counter)
articles = raw.json()["list"]
# "list" is the key of the nested dict in Pocket's reply under which the articles are listed.
# PHASE 1: Retrieve articles and metadata in a list
# retrieve_articles() appends the "item_id" of every considered item here.
article_id_list = []
wordcount, scraped_articles = retrieve_articles(articles, tag, results_counter)
# Without --process nothing is sent to the summarizer: print the results and
# the summary line, then exit with status 0.
if not args.process:
for article in scraped_articles:
print(article)
print(f"WC: {wordcount}")
print("---"*35)
print(f"TAG: {tag} | WC: {wordcount} | ARTICLES: {len(article_id_list)} | ARTICLES REQUESTED: {str(results_counter)}")
sys.exit(0)
# No item was collected: report it and exit with status 1.
if len(article_id_list) == 0:
print("[-] No results")
sys.exit(1)
# PHASE 2: Summarize each scraped article (mode "tldr") and collect the results in a list of TL;DRs
TLDR_list = []
for raw_content in scraped_articles:
# One summarize() call per scraped article
single_summary = summarize(raw_content, "tldr")
TLDR_list.append(single_summary)
# PHASE 3: Merge the per-article summaries into one text and summarize that again (mode "merge") to get the overall digest
raw_summaries = ""
for summary in TLDR_list:
# Concatenate the summaries, one per line
raw_summaries += summary + "\n"
# Generate the final digest
final_digest = summarize(raw_summaries, "merge")
print(final_digest)
""" Useful for later: Archive after reading """
# TODO: archive the processed items once an archive_items() helper exists:
#archive_items(article_id_list)