File size: 2,327 Bytes
2cbffb5
 
7067c1e
 
 
2cbffb5
7067c1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cbffb5
7067c1e
 
2cbffb5
 
 
 
 
 
7067c1e
 
 
2cbffb5
 
 
 
 
 
 
 
7067c1e
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import torch
import gradio as gr
import requests
import bs4
from bs4 import BeautifulSoup

def get_text_from_url(url, timeout=10):
    """Download a web page and return the text content of its HTML.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text extracted from the parsed HTML, or the sentinel string
        ``'error'`` when the URL is blank, the request fails, or the server
        answers with a status code other than 200.
    """
    # Reject blank / non-string input up front instead of letting the HTTP
    # client raise on a malformed URL.
    if not isinstance(url, str) or not url.strip():
        print("Error al obtener la página:", repr(url))
        return 'error'

    headers = {
        'Accept-Language': 'en-US,en;q=0.9',
    }
    try:
        # Without a timeout an unresponsive host would block the caller
        # indefinitely.
        response = requests.get(url.strip(), headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection failures, invalid URLs and timeouts are reported through
        # the same sentinel as HTTP errors so callers have one failure path.
        print("Error al obtener la página:", exc)
        return 'error'

    if response.status_code != 200:
        print("Error al obtener la página:", response.status_code)
        return 'error'

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

# Checkpoint of the sequence classifier whose class-1 probability `predict`
# reports as the risk indicator. The tokenizer and model are both loaded from
# this single constant so they cannot drift apart.
classification_model_checkpoint = 'FrancoMartino/privacyPolicies_classification'
classification_tokenizer = AutoTokenizer.from_pretrained(classification_model_checkpoint)
classification_model = AutoModelForSequenceClassification.from_pretrained(classification_model_checkpoint)

# Seq2seq summarizer that `predict` uses to shorten pages whose token count
# exceeds the 4096-token limit it applies to the classifier input.
summarization_model_checkpoint = "facebook/bart-large-cnn"
summarization_tokenizer = AutoTokenizer.from_pretrained(summarization_model_checkpoint)
summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_checkpoint)

def predict(url):
    """Score the privacy policy found at *url*.

    The page text is fetched with ``get_text_from_url``. If it tokenizes to
    more than 4096 classifier tokens it is first condensed with the
    summarization model; the (possibly summarized) text is then classified.

    Args:
        url: Address of the privacy policy page.

    Returns:
        ``{'Risk Indicator': p}`` where ``p`` is the softmax probability of
        class index 1, or ``{'ERROR': 'Error with the url'}`` when the URL is
        blank or the page cannot be retrieved.
    """
    error_result = {'ERROR': 'Error with the url'}

    # A blank or missing URL can never be fetched; fail fast without a request.
    if not isinstance(url, str) or not url.strip():
        return error_result

    try:
        text = get_text_from_url(url.strip())
    except requests.RequestException:
        # Network-level failures are surfaced as the same error payload
        # rather than as an unhandled exception in the UI.
        return error_result
    if text == 'error':
        return error_result

    if len(classification_tokenizer.tokenize(text)) > 4096:
        print('long')
        inputs = summarization_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        # Never ask the summarizer for more tokens than its configuration
        # declares position embeddings for; fall back to 4096 when the
        # config does not expose that attribute.
        max_summary_length = min(
            4096,
            getattr(summarization_model.config, 'max_position_embeddings', 4096),
        )
        with torch.no_grad():
            summary_ids = summarization_model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_summary_length,
            )
        text = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    inputs = classification_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=4096)
    with torch.no_grad():
        logits = classification_model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1)
    # Single input, so the batch has one row; column 1 is the reported class.
    prediction = probabilities[:, 1].item()
    return {'Risk Indicator': prediction}

# Sample privacy-policy pages passed to the interface as examples.
examples_urls = [
    ["https://help.instagram.com/155833707900388"],
    ["https://www.apple.com/legal/privacy/en-ww/"],
]

# Text input in, label output out; `predict` does the scoring.
interface = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="label",
    examples=examples_urls,
    title="Privacy Policy Risk Indicator",
    description="Enter a privacy policy URL to calculate risk.",
)
interface.launch()