
I want to convert my Python parser's output to JSON. Here is the code of my parser.

import re
from urllib.parse import quote, unquote
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup

url = "https://fasie.ru"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
div = soup.find_all('div', class_='wrap')
programms_list = div[1].find('ul', class_='').find_all('ul', class_='')[1]
hrefs = programms_list.find_all('a')
download_links = set()
response = requests.get(url + '/programs')
parse_text = BeautifulSoup(response.text, 'html.parser')
links = {x.get('href') for x in parse_text.find_all(href=re.compile('^/programs/'))}

def main():
    # Print the text of every <section> on each program page
    for h in hrefs:
        url_h = f"https://fasie.ru{h.get('href')}"
        page_h = urlopen(url_h)
        html_h = page_h.read().decode("utf-8")
        soup_h = BeautifulSoup(html_h, "html.parser")
        for s in soup_h.find_all('section'):
            print(s.text)
    # Collect every document link from the program pages
    for link in links:
        response = requests.get(url + link)
        parse_text = BeautifulSoup(response.text, 'html.parser')
        download_links.update(x.get('href') for x in parse_text.find_all(href=re.compile('^/upload/docs')))
    # Download each collected file once
    for link in download_links:
        file_name = unquote(link).split('/')[-1]
        response = requests.get(url + quote(link))
        with open(file_name, 'wb') as f:
            f.write(response.content)

main()

And here's what the JSON should look like:

[
  {
    "source": "Link the information was taken from", // in this case, the fasie.ru link
    "name": "ИнноШкольник",
    "description": "Information from the `О программе` (About the program) tab",
    "program": "Data from the `Конкурсы, программы` (Contests, programs) tab, as HTML",
    "contacts": [
      {
        "name": "Contact name",
        "tel": "Phone number",
        "email": "Contact email"
      }
    ],
    "documents": [
      {
        "source": "Original link of the file, i.e. where it was downloaded from",
        "path": "Relative path to the (already downloaded) file",
        "name": "File name",
        "extension": "File extension (e.g. pdf)",
        "size": 123 // Size in bytes
      }
    ]
  }
]

The goal is to create a bot that automatically outputs the information gathered by the Python code as JSON.
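
In Python terms, the target is just a list of dicts that can be passed to json.dump. A minimal sketch with placeholder values (the URL and field contents below are illustrative, not real scraped data):

import json

program = {
    "source": "https://fasie.ru/programs/example",  # placeholder URL
    "name": "ИнноШкольник",
    "description": "Text of the About the program tab ...",
    "program": "<p>HTML of the contests tab ...</p>",
    "contacts": [{"name": "Имя", "tel": "+7 ...", "email": "info@example.org"}],
    "documents": [],
}

# ensure_ascii=False keeps Cyrillic text readable instead of \uXXXX escapes
with open("output.json", "w", encoding="utf-8") as f:
    json.dump([program], f, indent=2, ensure_ascii=False)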

2 Answers


  1. Chosen as BEST ANSWER

Found it. Thanks to user510170.

    import json
    import re
    from urllib.parse import quote, unquote
    from urllib.request import urlopen
    import requests
    from bs4 import BeautifulSoup

    url = "https://fasie.ru"
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all('div', class_='wrap')
    programms_list = div[1].find('ul', class_='').find_all('ul', class_='')[1]
    hrefs = programms_list.find_all('a')
    response = requests.get(url + '/programs')
    parse_text = BeautifulSoup(response.text, 'html.parser')
    links = {x.get('href') for x in parse_text.find_all(href=re.compile('^/programs/'))}
    programs = []

    def clean(text):
        # Collapse newlines, tabs and non-breaking spaces into single spaces
        return ' '.join(text.split())
    
    def main():
        for h in hrefs:
            program = {}
            url_h = f"https://fasie.ru{h.get('href')}"
            page_h = urlopen(url_h)
            html_h = page_h.read().decode("utf-8")
            soup_h = BeautifulSoup(html_h, "html.parser")
            soup_b = BeautifulSoup(html_h, 'lxml')  # second parse with lxml for the contacts markup

            # content-tab3 holds the description tab, content-tab1 the programme text
            description = soup_h.find('section', {'id': 'content-tab3'})
            program['source'] = url_h
            program['name'] = h.text.strip()
            program['description'] = clean(description.text) if description else ''
            program['program'] = clean(soup_h.find('section', {'id': 'content-tab1'}).get_text())

            # Contacts are table rows in content-tab5 or, on some pages, paragraphs in content-tab4
            try:
                notag = soup_b.find('div', class_='tabs').find('section', id='content-tab5').find_all('tr')
            except AttributeError:
                notag = soup_b.find('div', class_='tabs').find('section', id='content-tab4').find_all('p')

            contacts = []
            for n in notag:
                nams, tels, emails = [], [], []
                try:
                    nam = [clean(i.text) for i in n.find_all('h4')]
                    if not nam:
                        nam = [clean(i.text) for i in n.find_all('b')]
                    nams = [i for i in nam if i.strip()]
                    email = ''.join(n.find('a').text.split())
                    if email:
                        emails.append(email)
                    tels = [t for t in (clean(i.text) for i in n.find_all('nobr')) if t]
                except AttributeError:
                    pass
                if nams or tels or emails:
                    contacts.append({'name': nams, 'tel': tels, 'email': emails})
            program['contacts'] = contacts

            # Collect this program's document links, download each file, and record its metadata
            download_links = set()
            documents = []
            for s in soup_h.find_all('section'):
                download_links.update(x.get('href') for x in s.find_all(href=re.compile('^/upload/docs')))
            for link in download_links:
                file_name = unquote(link).split('/')[-1]
                response = requests.get(url + quote(link))
                with open(file_name, 'wb') as f:
                    f.write(response.content)
                documents.append({
                    'source': url + link,
                    'path': file_name,
                    'name': file_name,
                    'extension': file_name.split('.')[-1],
                    'size': len(response.content),
                })
            program['documents'] = documents
            programs.append(program)

        with open('output.json', 'w', encoding='utf-8') as f:
            json.dump(programs, f, indent=2, ensure_ascii=False)

    main()
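
    The key detail for the output file is ensure_ascii=False together with encoding='utf-8': without them, json escapes every Cyrillic character as a \uXXXX sequence and the file becomes unreadable by eye.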
    

  2. I don't know if I understood you correctly, but to "JSON stringify" an object you can just build the object in question, import the json module, and use its dump (or dumps) method.
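
    For example, a minimal sketch where data stands in for whatever object you have built:

    import json

    data = {"name": "ИнноШкольник", "documents": []}  # placeholder object

    # "Stringify" to a str
    s = json.dumps(data, ensure_ascii=False, indent=2)

    # Or write straight to a file
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)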
